RNA-seq Ref-align Analysis (1)

2020-05-08 about 909 words 5 min

Contents

Introduction

This RNA-seq pipeline was built with HISAT2 + Stringtie + DESeq2 + clusterProfiler.

1st, Prepariation

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14


# Directory Deployment
mkdir -p ${HOME}/project/mRNA/{bin,data/{genes,genome,index,samples},output/{ballgown,hisat2,tmp}}


# Data link
cd ${HOME}/../Data_benthic/practice/mRNA
ln -s ${PWD}/Hdh.fa ${HOME}/project/mRNA/data/genome/
ln -s ${PWD}/Hdh.gff3 ${HOME}/project/mRNA/data/genes/

for i in `find ${PWD} -type f -name "*.fq.gz"`;
do
	ln -s ${i} ${HOME}/project/mRNA/data/samples/;
done

2nd, Alignment

1

cd ${HOME}/project/mRNA/bin

Copy the codes below to 02alignment.pbs

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71


#!/bin/bash -x
#PBS -N ref_mRNA_02alignment
#PBS -o ref_mRNA_02alignment.out
#PBS -e ref_mRNA_02alignment.err
#PBS -q high
#PBS -j oe
#PBS -l nodes=1:ppn=28

NUMCPUS=`wc -l <$PBS_NODEFILE`
THREADS=`expr ${NUMCPUS} \* 2`
BASEDIR="${HOME}/project/mRNA"
FASTQLOC="${BASEDIR}/data/samples"
GENOMEIDX="${BASEDIR}/data/index/Hdh_tran"
GTFFILE="${BASEDIR}/data/genes/Hdh.gtf"
GENOMEFA="${BASEDIR}/data/genome/Hdh.fa"
WRKDIR="${BASEDIR}/output"
ALIGNLOC="${WRKDIR}/hisat2"
BALLGOWNLOC="${WRKDIR}/ballgown"
TEMPLOC="${WRKDIR}/tmp"

# transfer gff to gtf
cd ${BASEDIR}/data/genes/
gffread Hdh.gff -T -o ${GTFFILE}

# Extract splice sites and exon info from gtf using the script under hisat_install_dir
cd ${BASEDIR}/data/index
python2 /public/tools/rna_seq/hisat2-2.1.0/extract_splice_sites.py ${GTFFILE} > Hdh.ss
python2 /public/tools/rna_seq/hisat2-2.1.0/extract_exons.py ${GTFFILE} > Hdh.exon

# Build genome index for hisat2
hisat2-build -p 50 --ss Hdh.ss --exon Hdh.exon ${GENOMEFA} Hdh_tran

## sample pair-end fastq must be named in this format “_1.*”"_2.*". eg, "gill01_1.fastq.gz","gill01_2.fastq.gz"
reads1=(${FASTQLOC}/*_1.*)
reads1=("${reads1[@]##*/}")
reads2=("${reads1[@]/_1./_2.}")


cd ${WRKDIR}

echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> START: HISAT2 Alignment"
for ((i=0; i<=${#reads1[@]}-1; i++ )); do
	sample="${reads1[$i]%%.*}"
	sample="${sample%_*}"

	stime=`date +"%Y-%m-%d %H:%M:%S"`
	echo "[$stime] Processing sample: $sample"
	echo [$stime] "   * Alignment of reads to genome (HISAT2)"

	hisat2 -p $NUMCPUS \
		--dta \
		-x ${GENOMEIDX} \
		-1 ${FASTQLOC}/${reads1[$i]} \
		-2 ${FASTQLOC}/${reads2[$i]} \
		-S ${TEMPLOC}/${sample}.sam 2>${ALIGNLOC}/${sample}.alnstats

echo [`date +"%Y-%m-%d %H:%M:%S"`] "   * Alignments conversion (SAMTools)"
	samtools view -S -b ${TEMPLOC}/${sample}.sam | \
		samtools sort -@ $NUMCPUS -o ${ALIGNLOC}/${sample}.bam -

	rm ${TEMPLOC}/${sample}.sam

echo [`date +"%Y-%m-%d %H:%M:%S"`] "   * Assemble transcripts (StringTie)"
	stringtie -p $NUMCPUS \
		-G ${GTFFILE} \
		-o ${ALIGNLOC}/${sample}.gtf \
		-l ${sample} ${ALIGNLOC}/${sample}.bam

done

echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> Step Two : Alignment Finished!"

Then run via PBS:

1
2


qsub 02alignment.pbs

3rd, Estimate transcript abundance

Copy the codes below to 03estimation.pbs

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60


#!/bin/bash -x
#PBS -N ref_mRNA_03estimation
#PBS -o ref_mRNA_03estimation.out
#PBS -e ref_mRNA_03estimation.err
#PBS -q high
#PBS -j oe
#PBS -l nodes=1:ppn=28

NUMCPUS=`wc -l <$PBS_NODEFILE`
THREADS=`expr ${NUMCPUS} \* 2`
BASEDIR="${HOME}/project/mRNA"
FASTQLOC="${BASEDIR}/data/samples"
GENOMEIDX="${BASEDIR}/data/index/Hdh_tran"
GTFFILE="${BASEDIR}/data/genes/Hdh.gtf"
GENOMEFA="${BASEDIR}/data/genome/Hdh.fa"
WRKDIR="${BASEDIR}/output"
ALIGNLOC="${WRKDIR}/hisat2"
BALLGOWNLOC="${WRKDIR}/ballgown"
TEMPLOC="${WRKDIR}/tmp"

reads1=(${FASTQLOC}/*_1.*)
reads1=("${reads1[@]##*/}")
reads2=("${reads1[@]/_1./_2.}")

cd ${WRKDIR}

echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> START: STRINGTIE Estimation"

echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> Merge all transcripts (StringTie)"
ls -1 ${ALIGNLOC}/*.gtf > ${ALIGNLOC}/mergelist.txt

stringtie --merge \
	-p $NUMCPUS \
	-G  ${GTFFILE} \
	-o ${BALLGOWNLOC}/stringtie_merged.gtf ${ALIGNLOC}/mergelist.txt

## Estimate transcript abundance
echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> Estimate abundance for each sample (StringTie)"
for ((i=0; i<=${#reads1[@]}-1; i++ )); do

	sample="${reads1[$i]%%.*}"
	sample="${sample%_*}"

	if [ ! -d ${BALLGOWNLOC}/${sample} ]; then
		mkdir -p ${BALLGOWNLOC}/${sample}
	fi

# Estimate transcript abundance depend on the referenge genome with ignoring novel transcripts
	stringtie -e -B \
		-p $NUMCPUS \
		-G ${GTFFILE} \
		-o ${BALLGOWNLOC}/${sample}/${sample}.gtf ${ALIGNLOC}/${sample}.bam

done

# Generate transcript_count_matrix.csv and gene_count_matrix.csv using the script under stringtie_install_dir

python2 /public/tools/rna_seq/stringtie-1.3.4/prepDE.py -l 150

echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> Step Three: Estimation Finished!"