16.6.16

Convert PLINK to fastPHASE with BASH and GNU Parallel

Problem: The converter built in to PLINK converts nucleotide encoded data to a 0/1 binary format automatically when using --recode-fastphase to create a fastPHASE input file. This causes irretrievable loss of information.

Solution:  I can't find any way around this other than to make a new converter.
1. Using PLINK, recode .ped file such that each chromosome has its own .ped/.map set of files.  Something like:
./plink --file mydata --chr 1 --recode --out malc1;
./plink --file mydata --chr 2 --recode --out malc2;

2. Use the following script to create fastPHASE input files. Enter your values for 'nchr', 'iroot', and 'oroot' after the function dbl().
dbl() {
i=$1;
f="$infile"".ped";
nrow=$(wc -l "$f" | awk '{print $1}');
echo "splitting row $i/$nrow";

# split lines containing genotypes, by allele, for each individual
#odd alleles
odd=$(sed -n $i"p" "$f" | tr ' ' '\n' | tail -n +7 | sed -n 'p;n' | tr '\n' ' ' | sed 's/ $//g');
#even alleles
even=$(sed -n $i"p" "$f" | tr ' ' '\n' | tail -n +7 | sed -n 'g;n;p' | tr '\n' ' ' | sed 's/ $//g');
echo "$odd" > "$i.tmp";
echo "$even" >> "$i.tmp";
}
export -f dbl;

nchr=17; #number of chromosomes
iroot="malc"; #root string of input files for each chromosome
oroot="chr"; #root string for output files, in fastPHASE format
for ((i=1;i<=$nchr;i++));
  do echo "chr"$i;
    export infile="$iroot""$i";
    h1=$(wc -l "$infile".ped | awk '{print $1}'); #n samples for fastPHASE header line 1
    h2a=$(head -1 "$infile".ped | awk '{print NF}');
    h2=$((($h2a-6)/2)); #n loci for fastPHASE header line 2
    h3a=$(awk '{print $4}' "$infile".map);
    h3=$(echo "P $h3a" | tr "\n" " "); #locus positions for fastPHASE header line 3
    s=$(awk '{print $2}' "$infile".ped);
    
    #split single line per sample plink format into two line per sample fastPHASE format
    seq 1 $h1 | parallel --env dbl --env infile dbl;

    #Reassemble into a single file:
    outfile="$oroot""$i".inp;
    echo "$h1" > "$outfile";
    echo "$h2" >> "$outfile";
    echo "$h3" >> "$outfile";
    for ((j=1;j<=$h1;j++));
      do sample=$(echo "$s" | sed -n $j"p")" ";
        echo "writing chr $i sample $sample $j/$h1";
        echo '>'"$sample" >> "$outfile";
        head -1 "$j.tmp" | sed 's/0/?/g' >> "$outfile";
        tail -n +2 "$j.tmp" | sed 's/0/?/g' >> "$outfile";
        rm "$j.tmp";
       done;
  done;

3. Phase away.

1 comment:

  1. dbl() {
    i=1;
    f=final4_"$i".ped;
    nrow=$(wc -l "$f" | awk '{print $1}');
    echo "splitting row $i/$nrow";

    # split lines containing genotypes, by allele, for each individual
    #odd alleles
    odd=$(sed -n $i"p" "$f" | tr ' ' '\n' | tail -n +7 | sed -n 'p;n' | tr '\n' ' ' | sed 's/ $//g');
    #even alleles
    even=$(sed -n $i"p" "$f" | tr ' ' '\n' | tail -n +7 | sed -n 'g;n;p' | tr '\n' ' ' | sed 's/ $//g');
    echo "$odd" > "$i.tmp";
    echo "$even" >> "$i.tmp";
    }
    export -f dbl;

    nchr=4; #number of chromosomes
    iroot="/home/cuser/Ankita-2017/shapeit/shapeit.v2.904.3.10.0-693.11.6.el7.x86_64/final4_"; #root string of input files for each chromosome
    oroot="/home/cuser/Ankita-2017/shapeit/shapeit.v2.904.3.10.0-693.11.6.el7.x86_64/final4_"; #root string for output files, in fastPHASE format
    for ((i=1;i<=$nchr;i++)); do
    echo "chr"$i;
    done
    export infile="$iroot""$i";
    h1=$(wc -l final4_"$i".ped | awk '{print $1}'); #n samples for fastPHASE header line 1
    h2a=$(head -1 final4_"$i".ped | awk '{print NF}');
    h2=$((($h2a-6)/2)); #n loci for fastPHASE header line 2
    h3a=$(awk '{print $4}' final4_"$i".map);
    h3=$(echo "P $h3a" | tr "\n" " "); #locus positions for fastPHASE header line 3
    s=$(awk '{print $2}' final4_"$i".ped);

    #split single line per sample plink format into two line per sample fastPHASE format
    seq 1 $h1 | parallel --env dbl --env final4_"$i".ped dbl;
    #seq 1..4 $h1 | dbl final4_"$i".ped dbl;


    #Reassemble into a single file:
    outfile="$oroot""$i".inp;
    echo "$h1" > "$outfile";
    echo "$h2" >> "$outfile";
    echo "$h3" >> "$outfile";
    for ((j=1;j<=$h1;j++));
    do sample=$(echo "$s" | sed -n $j"p")" ";
    echo "writing chr $i sample $sample $j/$h1";
    echo '>'"$sample" >> "$outfile";
    head -1 "$j.tmp" | sed 's/0/?/g' >> "$outfile";
    tail -n +2 "$j.tmp" | sed 's/0/?/g' >> "$outfile";
    rm "$j.tmp";
    done;
    done;




    ####I ran the above code and getting the following error.

    writing chr 5 sample SG1-3 1/165
    writing chr 5 sample SG1-14 2/165
    head: cannot open '2.tmp' for reading: No such file or directory
    tail: cannot open '2.tmp' for reading: No such file or directory
    rm: cannot remove '2.tmp': No such file or directory


    can you please help

    ReplyDelete