Introduction
This RNA-seq pipeline was built with HISAT2 + Stringtie + DESeq2 + clusterProfiler.
1st, Prepariation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
# Directory Deployment
mkdir -p ${HOME}/project/mRNA/{bin,data/{genes,genome,index,samples},output/{ballgown,hisat2,tmp}}
# Data link
cd ${HOME}/../Data_benthic/practice/mRNA
ln -s ${PWD}/Hdh.fa ${HOME}/project/mRNA/data/genome/
ln -s ${PWD}/Hdh.gff3 ${HOME}/project/mRNA/data/genes/
for i in `find ${PWD} -type f -name "*.fq.gz"`;
do
ln -s ${i} ${HOME}/project/mRNA/data/samples/;
done
|
2nd, Alignment
1
|
cd ${HOME}/project/mRNA/bin
|
Copy the codes below to 02alignment.pbs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
#!/bin/bash -x
#PBS -N ref_mRNA_02alignment
#PBS -o ref_mRNA_02alignment.out
#PBS -e ref_mRNA_02alignment.err
#PBS -q high
#PBS -j oe
#PBS -l nodes=1:ppn=28
NUMCPUS=`wc -l <$PBS_NODEFILE`
THREADS=`expr ${NUMCPUS} \* 2`
BASEDIR="${HOME}/project/mRNA"
FASTQLOC="${BASEDIR}/data/samples"
GENOMEIDX="${BASEDIR}/data/index/Hdh_tran"
GTFFILE="${BASEDIR}/data/genes/Hdh.gtf"
GENOMEFA="${BASEDIR}/data/genome/Hdh.fa"
WRKDIR="${BASEDIR}/output"
ALIGNLOC="${WRKDIR}/hisat2"
BALLGOWNLOC="${WRKDIR}/ballgown"
TEMPLOC="${WRKDIR}/tmp"
# transfer gff to gtf
cd ${BASEDIR}/data/genes/
gffread Hdh.gff -T -o ${GTFFILE}
# Extract splice sites and exon info from gtf using the script under hisat_install_dir
cd ${BASEDIR}/data/index
python2 /public/tools/rna_seq/hisat2-2.1.0/extract_splice_sites.py ${GTFFILE} > Hdh.ss
python2 /public/tools/rna_seq/hisat2-2.1.0/extract_exons.py ${GTFFILE} > Hdh.exon
# Build genome index for hisat2
hisat2-build -p 50 --ss Hdh.ss --exon Hdh.exon ${GENOMEFA} Hdh_tran
## sample pair-end fastq must be named in this format “_1.*”"_2.*". eg, "gill01_1.fastq.gz","gill01_2.fastq.gz"
reads1=(${FASTQLOC}/*_1.*)
reads1=("${reads1[@]##*/}")
reads2=("${reads1[@]/_1./_2.}")
cd ${WRKDIR}
echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> START: HISAT2 Alignment"
for ((i=0; i<=${#reads1[@]}-1; i++ )); do
sample="${reads1[$i]%%.*}"
sample="${sample%_*}"
stime=`date +"%Y-%m-%d %H:%M:%S"`
echo "[$stime] Processing sample: $sample"
echo [$stime] " * Alignment of reads to genome (HISAT2)"
hisat2 -p $NUMCPUS \
--dta \
-x ${GENOMEIDX} \
-1 ${FASTQLOC}/${reads1[$i]} \
-2 ${FASTQLOC}/${reads2[$i]} \
-S ${TEMPLOC}/${sample}.sam 2>${ALIGNLOC}/${sample}.alnstats
echo [`date +"%Y-%m-%d %H:%M:%S"`] " * Alignments conversion (SAMTools)"
samtools view -S -b ${TEMPLOC}/${sample}.sam | \
samtools sort -@ $NUMCPUS -o ${ALIGNLOC}/${sample}.bam -
rm ${TEMPLOC}/${sample}.sam
echo [`date +"%Y-%m-%d %H:%M:%S"`] " * Assemble transcripts (StringTie)"
stringtie -p $NUMCPUS \
-G ${GTFFILE} \
-o ${ALIGNLOC}/${sample}.gtf \
-l ${sample} ${ALIGNLOC}/${sample}.bam
done
echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> Step Two : Alignment Finished!"
|
Then run via PBS:
3rd, Estimate transcript abundance
Copy the codes below to 03estimation.pbs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
|
#!/bin/bash -x
#PBS -N ref_mRNA_03estimation
#PBS -o ref_mRNA_03estimation.out
#PBS -e ref_mRNA_03estimation.err
#PBS -q high
#PBS -j oe
#PBS -l nodes=1:ppn=28
NUMCPUS=`wc -l <$PBS_NODEFILE`
THREADS=`expr ${NUMCPUS} \* 2`
BASEDIR="${HOME}/project/mRNA"
FASTQLOC="${BASEDIR}/data/samples"
GENOMEIDX="${BASEDIR}/data/index/Hdh_tran"
GTFFILE="${BASEDIR}/data/genes/Hdh.gtf"
GENOMEFA="${BASEDIR}/data/genome/Hdh.fa"
WRKDIR="${BASEDIR}/output"
ALIGNLOC="${WRKDIR}/hisat2"
BALLGOWNLOC="${WRKDIR}/ballgown"
TEMPLOC="${WRKDIR}/tmp"
reads1=(${FASTQLOC}/*_1.*)
reads1=("${reads1[@]##*/}")
reads2=("${reads1[@]/_1./_2.}")
cd ${WRKDIR}
echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> START: STRINGTIE Estimation"
echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> Merge all transcripts (StringTie)"
ls -1 ${ALIGNLOC}/*.gtf > ${ALIGNLOC}/mergelist.txt
stringtie --merge \
-p $NUMCPUS \
-G ${GTFFILE} \
-o ${BALLGOWNLOC}/stringtie_merged.gtf ${ALIGNLOC}/mergelist.txt
## Estimate transcript abundance
echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> Estimate abundance for each sample (StringTie)"
for ((i=0; i<=${#reads1[@]}-1; i++ )); do
sample="${reads1[$i]%%.*}"
sample="${sample%_*}"
if [ ! -d ${BALLGOWNLOC}/${sample} ]; then
mkdir -p ${BALLGOWNLOC}/${sample}
fi
# Estimate transcript abundance depend on the referenge genome with ignoring novel transcripts
stringtie -e -B \
-p $NUMCPUS \
-G ${GTFFILE} \
-o ${BALLGOWNLOC}/${sample}/${sample}.gtf ${ALIGNLOC}/${sample}.bam
done
# Generate transcript_count_matrix.csv and gene_count_matrix.csv using the script under stringtie_install_dir
python2 /public/tools/rna_seq/stringtie-1.3.4/prepDE.py -l 150
echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> Step Three: Estimation Finished!"
|
Then run via PBS:
1
2
|
qsub 03estimation.pbs
|
4th, Differential expression analysis
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
|
ln -s /public/home/benthic/Data_benthic/practice/mRNA/phenodata.csv ${HOME}/project/mRNA/output/phenodata.csv
cd ${HOME}/project/mRNA/output
cat <<EOF >>deseq.r
library("DESeq2")
path=getwd()
countData <- as.matrix(read.csv(paste(path,"/transcript_count_matrix.csv",sep=""), row.names="transcript_id"))
colData <- read.csv(paste(path,"/phenodata.csv",sep=""), sep=",", row.names=1)
all(rownames(colData) %in% colnames(countData))
countData <- countData[, rownames(colData)]
all(rownames(colData) == colnames(countData))
dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design = ~ group)
dds <- DESeq(dds)
# filter count<10
dds_filter <- dds[ rowSums(counts(dds)) > 10, ]
dds <- DESeq(dds_filter)
# plot PCA
rld <- rlog(dds)
pdf("pca_t.pdf")
plotPCA(rld, intgroup="group")
dev.off()
# take group H_Red and H_YX for examples
res<- results(dds,contrast=c("group","H_Red","C_Red"), alpha=0.05)
resOrdered <- res[order(res$padj), ]
diff_gene_deseq2 <-subset(resOrdered,padj < 0.05 & (log2FoldChange > 1 | log2FoldChange < -1))
diff_gene_deseq2 <- merge(as.data.frame(diff_gene_deseq2), as.data.frame(counts(dds,normalize=TRUE)), by="row.names",sort=FALSE)
write.csv(diff_gene_deseq2, file="./Red_HvC.sig.csv", row.names = F)
EOF
qsub -I -N DESeq2 -l nodes=1:ppn=2 -l walltime=1000:00:00 -q high `Rscript deseq.R`
|
5th, Funtional enrichment analysis
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
|
cat <<EOF >>enrichment.R
sig_gene <- read.csv("./Red_HvC.sig.csv",header = T)
universe <- read.delim("/public/home/benthic/Data_benthic/practice/mRNA/Hdh_universe.txt",sep="\t",header=FALSE,stringsAsFactors = F)
go2gene <- read.delim("/public/home/benthic/Data_benthic/practice/mRNA/Hdh_go2gene.txt", sep="\t",header = FALSE,stringsAsFactors = F)
go2name <- read.delim("/public/home/benthic/Data_benthic/practice/mRNA/Hdh_go2term.txt", sep="\t",header = FALSE,stringsAsFactors = F)
map2gene <- read.delim("/public/home/benthic/Data_benthic/practice/mRNA/Hdh_map2gene.txt", sep="\t",header = FALSE,stringsAsFactors = F)
map2name <- read.delim("/public/home/benthic/Data_benthic/practice/mRNA/Hdh_map2term.txt", sep="\t",header = FALSE,stringsAsFactors = F)
library("clusterProfiler")
# ge enrichment
go=enricher(gene = sig_gene[,1],
universe = universe[,1],
TERM2GENE= go2gene,
TERM2NAME = go2name[,1:2])
pdf("go_enrich.pdf")
barplot(go,showcatergory=20)
#View(data.frame(go))
dev.off()
# kegg enrichment
kegg=enricher(gene = sig_gene[,1],
universe = universe[,1],
TERM2GENE= map2gene,
TERM2NAME = map2name)
pdf("kegg_enrich.pdf")
barplot(kegg,showcatergory=20)
dev.off()
EOF
qsub -I -N enrichment -l nodes=1:ppn=2 -l walltime=1000:00:00 -q high `Rscript enrichment.R`
|
Reference