Advanced Bioinformatics Services

Extracting promoter and TSS sequences from a genome FASTA

# Download, build, and install bedtools v2.28.0.
# By default wget names the download after the last URL component
# (v2.28.0.zip), so -O saves it under the name the unzip step expects.

wget -O bedtools2-2.28.0.zip https://github.com/arq5x/bedtools2/archive/v2.28.0.zip

unzip bedtools2-2.28.0.zip

cd bedtools2-2.28.0

make

# Copy the bedtools binary to a directory on your $PATH (e.g. /usr/local/bin):

cd bin

sudo cp bedtools* /usr/local/bin/

# Generate a FASTA file with the 2000 nt upstream of the TSS of every gene in
# the human genome, using the corresponding BED file.
# NOTE(review): the -s options below need strand information in the BED file
# (column 6 of a BED6 file) — confirm Homo_sapiens.GRCh38.93.gene.bed has it.

# Index the genome; the first two columns of the .fai index are used as the
# chromosome-sizes ("genome") file that bedtools flank requires.
samtools faidx Homo_sapiens.GRCh38.dna.primary_assembly.fa

cut -f 1,2 Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai > chrom.sizes

# With -s, -l is interpreted relative to strand, so the 2000 nt flank is
# upstream of the gene start on both + and - strands.
bedtools flank -i Homo_sapiens.GRCh38.93.gene.bed -g chrom.sizes -l 2000 -r 0 -s > Homo_sapiens.GRCh38.93.gene_2000up.bed

# -s reverse-complements intervals on the minus strand so every promoter is
# written 5'->3' relative to its gene; without it, minus-strand promoters
# would be emitted in forward-strand orientation.
bedtools getfasta -s -fi Homo_sapiens.GRCh38.dna.primary_assembly.fa -bed Homo_sapiens.GRCh38.93.gene_2000up.bed -fo Homo_sapiens.GRCh38.93.gene_2000up.fa

# Generate a FASTA file with 1000 nt upstream and 1000 nt downstream of the TSS
# of every gene in the human genome, using the corresponding BED file.
# NOTE(review): the -s options below need strand information in the BED file
# (column 6 of a BED6 file) — confirm Homo_sapiens.GRCh38.93.gene.bed has it.

# Index the genome; the first two columns of the .fai index are used as the
# chromosome-sizes ("genome") file that bedtools flank/slop require.
samtools faidx Homo_sapiens.GRCh38.dna.primary_assembly.fa

cut -f 1,2 Homo_sapiens.GRCh38.dna.primary_assembly.fa.fai > chrom.sizes

# Step 1 (flank): take the 1000 nt strand-aware upstream flank of each gene.
# Step 2 (slop):  extend that flank 1000 nt on its strand-aware right side,
#                 i.e. across the TSS into the gene body.
# The two steps are piped together so no scratch BED file is left behind.
bedtools flank -i Homo_sapiens.GRCh38.93.gene.bed -g chrom.sizes -l 1000 -r 0 -s \
  | bedtools slop -i stdin -g chrom.sizes -l 0 -r 1000 -s \
  > Homo_sapiens.GRCh38.93.gene_TSSs.bed

# -s reverse-complements intervals on the minus strand so every sequence is
# written 5'->3' relative to its gene; without it, minus-strand regions would
# be emitted in forward-strand orientation.
bedtools getfasta -s -fi Homo_sapiens.GRCh38.dna.primary_assembly.fa -bed Homo_sapiens.GRCh38.93.gene_TSSs.bed -fo Homo_sapiens.GRCh38.93.gene_TSSs.fa