Replace portion of fasta headers

Question:

I would like to replace a portion of the headers in a fasta file (surrounded by _) using a text file with a key.

#fasta file:
>mir-2_scf7180000350313_41896
CCATCAGAGTGGTTGTGATGTGGTGCTATTGATTCATATCACAGCCAGCTTTGATGAG
>mir-92a-2_scf7180000349939_17298
AGGTGGGGATGGGGGCAATATTTGTGAATGATTAAATTCAAATTGCACTTGTCCCGGCCTGC
>mir-279a_scf7180000350374_48557
AATGAGTGGCGGTCTAGTGCACGGTCGATAAAGTTGTGACTAGATCCACACTCATTAAG

#key_file.txt
scf7180000350313 NW_011929472.1
scf7180000349939 NW_011929473.1
scf7180000350374 NW_011929474.1

#expected result
>mir-2_NW_011929472.1_41896
CCATCAGAGTGGTTGTGATGTGGTGCTATTGATTCATATCACAGCCAGCTTTGATGAG
>mir-92a-2_NW_011929473.1_17298
AGGTGGGGATGGGGGCAATATTTGTGAATGATTAAATTCAAATTGCACTTGTCCCGGCCTGC
>mir-279a_NW_011929474.1_48557
AATGAGTGGCGGTCTAGTGCACGGTCGATAAAGTTGTGACTAGATCCACACTCATTAAG
Asked By: Amaranta_Remedios

||

Answers:

You can try this awk.

$ awk '
    NR == FNR{r[$1] = $2; next}          # first file: load the key file into a map, old id -> new id
    /^>/{                                # header lines only; sequence lines pass through unchanged
      n = split(substr($0, 2), f, "_")   # drop the leading ">" and split the header on "_"
      hdr = ""
      for(i = 1; i <= n; i++){
        if(f[i] in r){f[i] = r[f[i]]}    # exact whole-field lookup: keys are not regexes, no partial matches
        hdr = hdr (i > 1 ? "_" : "") f[i]
      }
      $0 = ">" hdr                       # rebuilt header; printed by the trailing "1"
    }1' key_file.txt fasta-file
>mir-2_NW_011929472.1_41896
CCATCAGAGTGGTTGTGATGTGGTGCTATTGATTCATATCACAGCCAGCTTTGATGAG
>mir-92a-2_NW_011929473.1_17298
AGGTGGGGATGGGGGCAATATTTGTGAATGATTAAATTCAAATTGCACTTGTCCCGGCCTGC
>mir-279a_NW_011929474.1_48557
AATGAGTGGCGGTCTAGTGCACGGTCGATAAAGTTGTGACTAGATCCACACTCATTAAG
Answered By: Andre Wildberg

In base R, one approach among many is to split each header on `_` and paste the pieces back together with the replacement ID:

# Read the two-column key table from standard input: V1 = old scaffold id, V2 = replacement id.
key_file <- read.table(file('stdin'))
scf7180000350313 NW_011929472.1
scf7180000349939 NW_011929473.1
scf7180000350374 NW_011929474.1
# press Ctrl-D to signal EOF (Unix)

# Read the fasta header lines (pasted without the leading '>') from standard input into column V1.
fasta_files <- read.table(file('stdin'))
mir-2_scf7180000350313_41896
mir-92a-2_scf7180000349939_17298
mir-279a_scf7180000350374_48557

# Split each header on the literal "_" and bind the pieces column-wise, giving
# one column per header: row 1 = name, row 2 = scaffold id, row 3 = trailing number.
header_parts <- strsplit(as.character(fasta_files$V1), split = '_', fixed = TRUE)
# Fail loudly on a malformed header instead of silently getting a list back.
stopifnot(lengths(header_parts) == 3L)
fasta_mtx <- do.call(cbind, header_parts)
fasta_mtx
     [,1]               [,2]               [,3]              
[1,] "mir-2"            "mir-92a-2"        "mir-279a"        
[2,] "scf7180000350313" "scf7180000349939" "scf7180000350374"
[3,] "41896"            "17298"            "48557"           

Then use sprintf to paste the pieces back together:

# Look up each header's scaffold id in the key by value (not by row position),
# so the key file may be in any order and may contain extra entries.
key_idx <- match(fasta_mtx[2, ], key_file$V1)
# Keep the original id when the key has no entry for it.
new_ids <- ifelse(is.na(key_idx), fasta_mtx[2, ], as.character(key_file$V2[key_idx]))
sprintf('%s_%s_%s', fasta_mtx[1, ], new_ids, fasta_mtx[3, ])
[1] "mir-2_NW_011929472.1_41896"     "mir-92a-2_NW_011929473.1_17298"
[3] "mir-279a_NW_011929474.1_48557"
Answered By: Chris
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.