# Advanced Bioinformatics Services

# Manipulating FASTQ and FASTA files in the Linux shell (Bash)

# Counting the number of reads in an uncompressed FASTQ file.
# Assumes 4 lines per read, so reads = line count / 4 (integer division).
# wc reads the file through a redirection (no extra cat process) and the
# division is done by the shell (no bc needed). If the file cannot be
# opened, nothing is printed instead of a misleading 0.

lines=$(wc -l < fastq_file.fastq) && echo "$(( lines / 4 ))"

# Counting the number of reads in a gzip-compressed FASTQ file.
# Assumes 4 lines per read, so reads = line count / 4 (integer division).
# gzip -dc is used rather than zcat because some platforms ship a zcat that
# only accepts .Z files; the division is done by the shell (no bc needed).

lines=$(gzip -dc fastq_file.fastq.gz | wc -l) && echo "$(( lines / 4 ))"

# Examining a range of lines in a FASTQ file (e.g., lines 530 to 640) | way 1
# -n turns off automatic printing, the first expression prints the wanted
# range, and the second quits at line 641 so the rest of the file is not read.

sed -n -e '530,640p' -e '641q' fastq_file.fastq

# Examining a range of lines in a FASTQ file (e.g., lines 530 to 640) | way 2
# The first rule stops awk once the range has been passed, so the rest of
# the file is not scanned; the second rule prints lines 530 through 640.

awk 'FNR > 640 { exit } FNR >= 530' fastq_file.fastq

# Converting FASTQ to FASTA | way 1
# Works on 4-line records: line 1 of each record supplies the read name
# (leading "@" dropped) and line 2 supplies the sequence; lines 3 and 4
# are ignored.

awk '
  NR % 4 == 1 { id = substr($0, 2) }
  NR % 4 == 2 { printf ">%s\n%s\n", id, $0 }
' input.fastq > output.fa

# Converting FASTQ to FASTA | way 2
# Lines are selected by their position in the 4-line record rather than by
# a leading "@", because a quality line may itself start with "@" and would
# otherwise be mistaken for a header.
#   s/^@/>/p  line 1: turn the header into a FASTA header and print it
#   n;p       line 2: fetch the sequence line and print it
#   n;n       lines 3-4: fetch and discard the "+" and quality lines

sed -n 's/^@/>/p;n;p;n;n' input.fastq > output.fa

# Extracting all reads that contain the XbaI cleavage site (TCTAGA)
# Works on 4-line records: the read name is remembered from line 1, and a
# FASTA record is printed whenever the sequence on line 2 matches the site.

awk '
  NR % 4 == 1 { id = substr($0, 2) }
  NR % 4 == 2 && $1 ~ /TCTAGA/ { printf ">%s\n%s\n", id, $0 }
' fastq_file.fastq

# Counting the number of sequences in a FASTA file
# Counts the lines that begin with ">", i.e. the header lines.

grep -c -e '^>' fasta_file.fa

# Extracting the FASTA headers (e.g. to obtain a table with gene/transcript
# annotation from a given reference)
# The pattern is anchored with ^ so that only lines that start with ">" are
# kept, not every line that merely contains a ">" somewhere.

grep '^>' fasta.fa > fasta_header

# Cleaning up FASTA headers so that only the first whitespace-delimited
# column of each header remains.
# Only lines starting with ">" are trimmed; every other line is copied
# through unchanged, so sequence lines that contain blanks are not truncated.

awk '/^>/ { print $1; next } { print }' fasta_file_input.fa > fasta_file_output.fa

perl -p -i -e 's/>(.+?) .+/>$1/g' fasta_file.fa

# Converting a multi-line FASTA to a single-line FASTA
# Sequence lines are written without a newline so that they join together.
# "pending" records that such an unterminated sequence is waiting; it is
# closed with a newline before the next header and once more at end of input.

awk '
  /^>/ { if (pending) printf "\n"; pending = 0; print; next }
       { printf "%s", $0; pending = 1 }
  END  { if (pending) printf "\n" }
' multi_line.fa > single_line.fa