# Fetch the GENCODE release 28 transcript reference sequences and decompress them.
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_28/gencode.v28.transcripts.fa.gz
gunzip gencode.v28.transcripts.fa.gz
# Turn every "|" into "#". The edit is done in place, so no intermediate
# copy has to be written and renamed afterwards.
sed -i -e 'y/|/#/' gencode.v28.transcripts.fa
# Extract sequences corresponding to different RNA classes.
# Each -name wildcard pattern is single-quoted so the shell passes it to
# faFilter verbatim instead of attempting filename expansion on it.
faFilter -name='*protein_coding*' gencode.v28.transcripts.fa mRNA_GENCODE.fa # protein coding mRNA
faFilter -name='*#rRNA#*' gencode.v28.transcripts.fa rRNA_GENCODE.fa # Ribosomal RNA
faFilter -name='*#snoRNA#*' gencode.v28.transcripts.fa snoRNA_GENCODE.fa # Small nucleolar RNA
faFilter -name='*#scaRNA#*' gencode.v28.transcripts.fa scaRNA_GENCODE.fa # Small Cajal body-specific RNA
faFilter -name='*#snRNA#*' gencode.v28.transcripts.fa snRNA_GENCODE.fa # Small nuclear RNA
faFilter -name='*RN7S*' gencode.v28.transcripts.fa RN7S_GENCODE.fa # Signal recognition particle RNA
faFilter -name='*miRNA*' gencode.v28.transcripts.fa premiRNA_GENCODE.fa # pre-microRNA hairpins
faFilter -name='*RNY*' gencode.v28.transcripts.fa RNY_GENCODE.fa # Ro-associated Y RNA
faFilter -name='*VTRNA*' gencode.v28.transcripts.fa VTRNA_GENCODE.fa # Vault RNA
faFilter -name='*lincRNA*' gencode.v28.transcripts.fa lincRNA_GENCODE.fa # Long intervening noncoding RNAs
# "IG_" and "TR_" are only the leading part of the biotype (e.g. IG_V_gene),
# so the pattern must leave the remainder open with "*". Closing it with a
# second "#" right after the underscore would demand a literal "#IG_#" /
# "#TR_#" and match nothing.
faFilter -name='*#IG_*' gencode.v28.transcripts.fa IG_VDJC_genes_GENCODE.fa # Immunoglobulin V, D, J and C regions
faFilter -name='*#TR_*' gencode.v28.transcripts.fa TR_VDJC_genes_GENCODE.fa # T-cell receptor genes
# Fetch the Ensembl release 93 human non-coding RNA and cDNA reference
# sequences, decompress each file, and replace every space with "#" in place.
for ens_set in ncrna cdna.all; do
  ens_fa="Homo_sapiens.GRCh38.${ens_set}.fa"
  # The FTP sub-directory is the part of the set name before the first dot
  # ("ncrna" -> ncrna, "cdna.all" -> cdna).
  wget "ftp://ftp.ensembl.org/pub/release-93/fasta/homo_sapiens/${ens_set%%.*}/${ens_fa}.gz"
  gunzip "${ens_fa}.gz"
  sed -i -e 's/ /#/g' "${ens_fa}"
done
# Extract sequences corresponding to different RNA classes.
# Each -name wildcard pattern is single-quoted so the shell passes it to
# faFilter verbatim instead of attempting filename expansion on it.
# NOTE: some output names are spelled "Ensemble" rather than "Ensembl"; the
# spelling is kept as-is so that anything consuming these files still finds them.
faFilter -name='*gene_biotype:protein_coding#transcript_biotype:protein_coding*' Homo_sapiens.GRCh38.cdna.all.fa mRNA_Ensembl.fa # protein coding mRNA
faFilter -name='*#transcript_biotype:rRNA*' Homo_sapiens.GRCh38.ncrna.fa rRNA_Ensembl.fa
faFilter -name='*#gene_biotype:snoRNA*' Homo_sapiens.GRCh38.ncrna.fa snoRNA_Ensembl.fa
faFilter -name='*#gene_biotype:scaRNA*' Homo_sapiens.GRCh38.ncrna.fa scaRNA_Ensembl.fa
# RNU_Ensembl.fa holds the small nuclear RNAs, so it is selected by the snRNA
# biotype (repeating the snoRNA pattern would just duplicate snoRNA_Ensembl.fa).
faFilter -name='*#gene_biotype:snRNA*' Homo_sapiens.GRCh38.ncrna.fa RNU_Ensembl.fa
faFilter -name='*#gene_biotype:miRNA*' Homo_sapiens.GRCh38.ncrna.fa miRNA_Ensemble.fa
faFilter -name='*#gene_symbol:RN7S*' Homo_sapiens.GRCh38.ncrna.fa RN7S_Ensembl.fa
faFilter -name='*#gene_symbol:RNY*' Homo_sapiens.GRCh38.ncrna.fa RNY_Ensembl.fa
faFilter -name='*#gene_symbol:VTRNA*' Homo_sapiens.GRCh38.ncrna.fa vaultRNA_Ensembl.fa
# lincRNA: collect records flagged as lincRNA at the gene level and at the
# transcript level, then merge them. A record that matches both filters is
# present in both intermediate files, so the merged set is passed through
# "faFilter -uniq" to drop the repeated entries.
faFilter -name='*#gene_biotype:lincRNA*' Homo_sapiens.GRCh38.ncrna.fa lincRNAg_Ensemble.fa
faFilter -name='*#transcript_biotype:lincRNA*' Homo_sapiens.GRCh38.ncrna.fa lincRNAt_Ensemble.fa
cat lincRNAg_Ensemble.fa lincRNAt_Ensemble.fa > lincRNA_merged_Ensemble.fa
faFilter -uniq lincRNA_merged_Ensemble.fa lincRNA_Ensemble.fa
rm -f lincRNA_merged_Ensemble.fa
# NOTE(review): immunoglobulin and T-cell receptor gene segments are expected
# in the cDNA set, not the ncRNA set (see the Ensembl FASTA README), so they
# are filtered from cdna.all.fa here - confirm against the release in use.
faFilter -name='*#gene_biotype:IG_*' Homo_sapiens.GRCh38.cdna.all.fa IG_VDJC_genes_Ensembl.fa
faFilter -name='*#gene_biotype:TR_*' Homo_sapiens.GRCh38.cdna.all.fa TR_VDJC_genes_Ensembl.fa
# Manual step - open the UCSC Table Browser:
# https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=686006651_J359L66i9WtAtipTVQdfVUrkPqZa&clade=mammal&org=Human&db=hg38&hgta_group=genes&hgta_track=refSeqComposite&hgta_table=refGene&hgta_regionType=genome&position=chr1%3A11102837-11267747&hgta_outputType=primaryTable&hgta_outFileName=
# Download the hg38 transcript reference sequences:
# Group: Genes_and_Gene_Predictions -> Track: NCBI RefSeq -> Table: UCSC RefSeq (refGene) -> output format: sequence -> genomic -> output file: UCSC_RefSeq_transcripts.fa
# Download the gene names for the transcript references:
# Group: Genes_and_Gene_Predictions -> Track: NCBI RefSeq -> Table: UCSC RefSeq (refGene) -> output format: selected fields from primary and related tables -> check the "name" and "name2" boxes -> output file: UCSC_RefSeq_transcripts_names
# Drop repeated entries from the two UCSC downloads. The names table is
# de-duplicated line by line (first occurrence wins, order preserved); the
# FASTA file is de-duplicated with faFilter -uniq.
awk '!($0 in seen) { seen[$0]; print }' UCSC_RefSeq_transcripts_names > UCSC_RefSeq_transcripts_names_noDup
mv UCSC_RefSeq_transcripts_names_noDup UCSC_RefSeq_transcripts_names
faFilter -uniq UCSC_RefSeq_transcripts.fa UCSC_RefSeq_transcripts_noDup.fa
mv UCSC_RefSeq_transcripts_noDup.fa UCSC_RefSeq_transcripts.fa
# Shorten the FASTA headers by stripping the "hg38_refGene_" prefix in front of the accession.
sed -i -e 's/hg38_refGene_N/N/g' UCSC_RefSeq_transcripts.fa
# Split UCSC_RefSeq_transcripts.fa by accession type.
# Patterns are quoted so the shell hands them to faFilter unchanged.
faFilter -name='*NM*' UCSC_RefSeq_transcripts.fa mRNA_ucsc.fa #All protein coding transcripts
faFilter -name='*NR*' UCSC_RefSeq_transcripts.fa ncRNA_ucsc.fa #All non-coding transcripts
# Extract different non-coding RNA classes from UCSC_RefSeq_transcripts.fa.
# Each class is selected with a list of name patterns (-namePatList). The
# lists have to exist before faFilter reads them, so they are downloaded first.
for pat_class in VTRNA snoRNA scaRNA rRNA RNY snRNA RNA7S; do
  wget "https://bioinformatics.sciberg.com/wp-content/uploads/2018/08/${pat_class}_ucsc.txt"
done
faFilter -namePatList=VTRNA_ucsc.txt UCSC_RefSeq_transcripts.fa VTRNA_ucsc.fa
faFilter -namePatList=snoRNA_ucsc.txt UCSC_RefSeq_transcripts.fa snoRNA_ucsc.fa
faFilter -namePatList=scaRNA_ucsc.txt UCSC_RefSeq_transcripts.fa scaRNA_ucsc.fa
faFilter -namePatList=rRNA_ucsc.txt UCSC_RefSeq_transcripts.fa rRNA_ucsc.fa
faFilter -namePatList=RNY_ucsc.txt UCSC_RefSeq_transcripts.fa RNY_ucsc.fa
faFilter -namePatList=snRNA_ucsc.txt UCSC_RefSeq_transcripts.fa snRNA_ucsc.fa
# The pattern list is named RNA7S_ucsc.txt while the output is RN7S_ucsc.fa.
faFilter -namePatList=RNA7S_ucsc.txt UCSC_RefSeq_transcripts.fa RN7S_ucsc.fa
# Manual step - open the UCSC Table Browser:
# https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=686006651_J359L66i9WtAtipTVQdfVUrkPqZa&clade=mammal&org=Human&db=hg38&hgta_group=genes&hgta_track=tRNAs&hgta_table=0&hgta_regionType=genome&position=chr1%3A11102837-11267747&hgta_outputType=primaryTable&hgta_outFileName=
# Download the hg38 tRNA reference sequences:
# Group: Genes_and_Gene_Predictions -> Track: tRNA genes -> Table: tRNAs -> output format: sequence -> output file: USCS_tRNA.fa
# Download and unzip the miRBase mature miRNA and hairpin (pre-miRNA) references:
wget ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz
wget ftp://mirbase.org/pub/mirbase/CURRENT/hairpin.fa.gz
gunzip mature.fa.gz
gunzip hairpin.fa.gz
# Extract human-only miRNA and pre-miRNA (record names starting with "hsa-").
# The pattern is quoted so the shell passes it to faFilter unchanged.
faFilter -name='hsa-*' mature.fa hsa_miRNA_temp.fa
faFilter -name='hsa-*' hairpin.fa hsa_premiRNA_temp.fa
# Change RNA into DNA sequences (U -> T). The substitution is limited to
# sequence lines; header lines (starting with ">") are copied through
# untouched so a "U" in a record name can never be rewritten.
sed '/^>/!s/U/T/g' hsa_miRNA_temp.fa > hsa_miRNA.fa
sed '/^>/!s/U/T/g' hsa_premiRNA_temp.fa > hsa_premiRNA.fa