Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-ports-kde
Path: blob/main/biology/ddocent/files/ddocent-assembly-test
16461 views
#!/usr/bin/env bash

set -e

##########################################################################
#   Function description:
#       Pause until user presses return
##########################################################################

pause()
{
    local junk
    
    printf "Press return to continue..."
    read junk
}


##########################################################################
#   Not necessary if invoking scripts with "bash script-name" as shown
#   in the dDocent tutorial, but supplied for completeness.  Tutorial
#   scripts have also been patched upstream to usr /usr/bin/env, but
#   we won't assume they'll all stay that way.
##########################################################################

fix_bash_path()
{
    # Don't echo these commands
    { set +x; } 2>/dev/null
    if [ $# != 1 ]; then
	printf "Usage: $0 script\n"
	exit 1
    fi
    sed -i.bak -e "s|#!/bin/bash|#!/usr/bin/env bash|g" $1
    set -x
}


##########################################################################
#   Main
##########################################################################

##########################################################################
#   Workarounds for running dDocent from FreeBSD ports and Pkgsrc.
#   Include this section in any scripts running dDocent commands.
##########################################################################

# Hack to allow remake_reference.sh, etc. to find trimmomatic.
# trimmomatic.jar and adapter files are not an executables and should not
# be in PATH according to filesystem hierarchy standards, but the dDocent
# scripts are coded to look for them there, because that's how dDocent
# installs the bundled Trimmomatic.
if [ -e /usr/local/share/java/classes/trimmomatic.jar ]; then
    export PATH=${PATH}:/usr/local/share/java/classes:/usr/local/share/trimmomatic/adapters
elif [ -e $PKGSRC/lib/java/trimmomatic.jar ]; then
    export PATH=${PATH}:$PKGSRC/lib/java:$PKGSRC/share/trimmomatic/adapters
else
    printf "Error: Trimmomatic is not installed in a known location.\n" >> /dev/stderr
fi

if ! pwd | fgrep dDocent-test; then
    mkdir -p dDocent-test
    cd dDocent-test
fi

# remake_reference.sh and some other scripts use GNU extensions in awk
# commands.  Trick systems into using gawk to get around this without
# having to patch every script.
mkdir -p ddocent-bin
ln -sf `which gawk` ddocent-bin/awk
ln -sf `which python2.7` ddocent-bin/python
export PATH=`pwd`/ddocent-bin:$PATH

##########################################################################
#   Actual dDocent commands below
##########################################################################

##########################################################################
#   Command sequence from the dDocent tutorial on Github.  See link below.
##########################################################################

cat << EOM

This script runs the test commands described at

http://ddocent.com/assembly

You may want to follow along on this web page as the script runs.  It will
pause after each step to allow checking the output.

EOM
pause

mkdir -p Data
cd Data

printf "Downloading and unpacking data.zip...\n"
if [ -e data.zip ]; then
    mv data.zip /tmp/dDocent-data.zip
    rm -rf *
    mv /tmp/dDocent-data.zip data.zip
else
    rm -rf *
    curl --insecure -L -o data.zip \
	'https://www.dropbox.com/s/t09xjuudev4de72/data.zip?dl=0'
fi

# Hack: current data.zip contains data.zip.  ??!
if [ ! -e simRRLs2.py ]; then
    yes | unzip data.zip || true
fi
ls -l
pause

set -x
head SimRAD.barcodes
{ set +x; } 2>/dev/null
pause

set -x
cut -f2 SimRAD.barcodes > barcodes
head barcodes
{ set +x; } 2>/dev/null
pause

set -x
process_radtags -1 SimRAD_R1.fastq.gz -2 SimRAD_R2.fastq.gz -b barcodes \
    -e ecoRI --renz_2 mspI -r -i gzfastq
ls | fgrep sample_ | head
{ set +x; } 2>/dev/null
pause

set -x
rm *rem*
{ set +x; } 2>/dev/null
pause

{ set +x; } 2>/dev/null
pause

set -x
Rename_for_dDocent.sh SimRAD.barcodes
{ set +x; } 2>/dev/null

set -x
ls *.fq.gz
{ set +x; } 2>/dev/null
pause

set -x
ls *.F.fq.gz > namelist
sed -i'' -e 's/.F.fq.gz//g' namelist
AWK1='BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}'
AWK2='!/>/'
AWK3='!/NNN/'
PERLT='while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}'

cat namelist | parallel --no-notice -j 8 "zcat {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
cat namelist | parallel --no-notice -j 8 "zcat {}.R.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
cat namelist | parallel --no-notice -j 8 "paste -d '-' {}.forward {}.reverse | mawk '$AWK3' | sed 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.seqs"
{ set +x; } 2>/dev/null
pause

set -x
cat *.uniq.seqs > uniq.seqs
for i in {2..20};
do 
echo $i >> pfile
done
cat pfile | parallel --no-notice \
    "echo -n {}xxx && mawk -v x={} '\$1 >= x' uniq.seqs |  wc -l" \
    | mawk  '{gsub("xxx","\t",$0); print;}'| sort -g > uniqseq.data
rm pfile
{ set +x; } 2>/dev/null
pause

set -x
more uniqseq.data
{ set +x; } 2>/dev/null
pause

set -x
gnuplot << \EOF 
set terminal dumb size 80, 30
set autoscale
set xrange [2:20] 
unset label
set title "Number of Unique Sequences with More than X Coverage (Counted within individuals)"
set xlabel "Coverage"
set ylabel "Number of Unique Sequences"
plot 'uniqseq.data' with lines notitle
pause -1
EOF
{ set +x; } 2>/dev/null
pause

set -x
parallel --no-notice -j 8 mawk -v x=4 \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' > uniqCperindv
wc -l uniqCperindv
{ set +x; } 2>/dev/null
pause

set -x
for ((i = 2; i <= 10; i++));
do
echo $i >> ufile
done

cat ufile | parallel --no-notice "echo -n {}xxx && mawk -v x={} '\$1 >= x' uniqCperindv | wc -l" | mawk  '{gsub("xxx","\t",$0); print;}'| sort -g > uniqseq.peri.data
rm ufile
{ set +x; } 2>/dev/null
pause

set -x
gnuplot << \EOF 
set terminal dumb size 80, 30
set autoscale 
unset label
set title "Number of Unique Sequences present in more than X Individuals"
set xlabel "Number of Individuals"
set ylabel "Number of Unique Sequences"
plot 'uniqseq.peri.data' with lines notitle
pause -1
EOF
{ set +x; } 2>/dev/null
pause

set -x
mawk -v x=4 '$1 >= x' uniqCperindv > uniq.k.4.c.4.seqs
wc -l uniq.k.4.c.4.seqs
{ set +x; } 2>/dev/null
pause

set -x
cut -f2 uniq.k.4.c.4.seqs > totaluniqseq
mawk '{c= c + 1; print ">Contig_" c "\n" $1}' totaluniqseq > uniq.fasta
{ set +x; } 2>/dev/null
pause

set -x
sed -e 's/NNNNNNNNNN/\t/g' uniq.fasta | cut -f1 > uniq.F.fasta
{ set +x; } 2>/dev/null
pause

set -x
cd-hit-est -i uniq.F.fasta -o xxx -c 0.8 -T 0 -M 0 -g 1
{ set +x; } 2>/dev/null
pause

set -x
mawk '{if ($1 ~ /Cl/) clus = clus + 1; else  print $3 "\t" clus}' xxx.clstr | sed 's/[>Contig_,...]//g' | sort -g -k1 > sort.contig.cluster.ids
paste sort.contig.cluster.ids totaluniqseq > contig.cluster.totaluniqseq
sort -k2,2 -g contig.cluster.totaluniqseq | sed -e 's/NNNNNNNNNN/\t/g' > rcluster
{ set +x; } 2>/dev/null
pause

set -x
head rcluster
{ set +x; } 2>/dev/null
pause

set -x
cut -f2 rcluster | uniq | wc -l
{ set +x; } 2>/dev/null
pause

set -x
rainbow div -i rcluster -o rbdiv.out
{ set +x; } 2>/dev/null
pause

set -x
rainbow div -i rcluster -o rbdiv.out -f 0.5 -K 10
{ set +x; } 2>/dev/null
pause

set -x
rainbow merge -o rbasm.out -a -i rbdiv.out
{ set +x; } 2>/dev/null
pause

set -x
rainbow merge -o rbasm.out -a -i rbdiv.out -r 2
{ set +x; } 2>/dev/null
pause

# If missing fdesc mount for bash, <(echo "E") will not work
# cat rbasm.out <(echo "E") |sed 's/[0-9]*:[0-9]*://g' | mawk ' {
set -x
echo "E" > endfile
cat rbasm.out endfile |sed 's/[0-9]*:[0-9]*://g' | mawk ' {
if (NR == 1) e=$2;
else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
else if ($1 ~/C/) clus=$2;
else if ($1 ~/L/) len=$2;
else if ($1 ~/S/) seq=$2;
else if ($1 ~/N/) freq=$2;
else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/) {seq1 = seq; fclus=clus; len1=len}
else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
}' > rainbow.fasta
{ set +x; } 2>/dev/null
pause

set -x
cd-hit-est -i rainbow.fasta -o referenceRC.fasta -M 0 -T 0 -c 0.9
{ set +x; } 2>/dev/null
pause

remake_reference.sh 4 4 0.90 PE 2
{ set +x; } 2>/dev/null
pause

ReferenceOpt.sh

bash ReferenceOpt.sh 4 8 4 8 PE 16
{ set +x; } 2>/dev/null
pause

set -x
bash remake_reference.sh 4 4 0.90 PE 2
head reference.fasta
{ set +x; } 2>/dev/null
pause

set -x
wc -l reference.fasta
{ set +x; } 2>/dev/null
pause

set -x
mawk '/>/' reference.fasta | wc -l
{ set +x; } 2>/dev/null
pause

set -x
mawk '/^NAATT.*N*.*CCGN$/' reference.fasta | wc -l
grep '^NAATT.*N*.*CCGN$' reference.fasta | wc -l
{ set +x; } 2>/dev/null
pause

printf "Bonus Section: Optimize reference assemblies? (takes a long time) y/[n] "
read bonus
if [ 0$bonus = 0y ]; then
    set -x
    { set +x; } 2>/dev/null
    printf "Running dDocent to trim reads.\n"
    pause
    set -x
    cat << EOM | dDocent || true
yes
8
10g
yes
no
no
no
[email protected]
EOM
    RefMapOpt.sh 4 8 4 8 0.9 64 PE
    { set +x; } 2>/dev/null
    pause
    more mapping.results
    pause
fi