参考基因组及注释文件
最后发布时间:2022-12-20 15:33:06
浏览量:
统计基因组中碱基的个数
# Tally every character found on the non-header lines of the FASTA file and
# print one "<character> <count>" line per character, sorted by character.
grep -v ">" ${testData}/RNA-seq/genomic/chr22_with_ERCC92.fa | perl -ne 'chomp; $count{$_}++ for split //; END { print "$_ $count{$_}\n" for sort keys %count }'
GRCh37和GRCh38都是Genome Reference Consortium(GRC)发布的人类参考基因组组装版本。GRCh37发布于2009年,GRCh38(也称为“build 38”)则在四年后的2013年发布,因此可以将GRCh38视为在早期版本基础上进行了更新、并带有更新注释的新版本。
- GRCh38:Genome Reference Consortium Human Build 38(Organism: Homo sapiens)
参考基因的下载
- UCSC
- Ensembl
- NCBI
GENCODE
hg19与hg38序列的比较
library(tidyverse)
library("scales")
library(rtracklayer)
# Load the GENCODE annotation, flatten it to a data frame, and cache it as a
# TSV; the working copy of `gtf_data` is then read back from that TSV.
# Alternative input (GENCODE v39):
#   import("reference/gencode.v39.annotation.gtf")
import("reference/gencode.v19.annotation.gtf.gz") |>
  as.data.frame() |>
  write_tsv(file = "gtf_data.tsv")
gtf_data <- read_tsv("gtf_data.tsv")
# Chromosome names in display order: autosomes 1-22, then X, Y and M.
chrom_order <- c(paste0("chr", 1:22), "chrX", "chrY", "chrM")

# Named lookup: chromosome name -> its position ("1" .. "25") as a string.
chrom_key <- setNames(
  object = as.character(seq_along(chrom_order)),
  nm = chrom_order
)

# Re-express the names as a factor whose levels run in reverse order.
# NOTE(review): when this factor is later passed as `levels =` to factor(),
# matching appears to happen on its values (chr1 first), so the reversed level
# order may not carry through to the derived factors -- confirm intent.
chrom_order <- factor(x = chrom_order, levels = rev(chrom_order))
# Approximate each chromosome's length from the annotation itself.
# The plot draws every chromosome outline from 0 up to `size`, while the
# annotated features are drawn at their absolute start/end coordinates, so
# `size` has to be the largest annotated end coordinate. Using the span
# max(end) - min(start) would leave the outline short by min(start) and let
# the last features stick out past it.
chrom_sizes2 <- gtf_data |>
  mutate(chromosome = seqnames) |>
  group_by(chromosome) |>
  summarise(size = max(end))
# Convert to a factor so as.numeric(chromosome) yields the plotting position;
# sequence names not listed in chrom_order become NA.
chrom_sizes2$chromosome <- factor(x = chrom_sizes2$chromosome, levels = chrom_order)
# Intervals of protein-coding genes to overlay on the chromosome outlines.
# Only gene-level records are kept: GENCODE GTFs also list transcript, exon,
# CDS, ... records carrying the same gene_type, all of which lie inside their
# gene's start/end interval. Plotting them as well covers no extra area but
# multiplies the number of rectangles the plot has to draw.
sample_cns <- gtf_data |>
  filter(type == "gene", gene_type == "protein_coding") |>
  select(chromosome = seqnames, start, end, gene_type)
# Convert to a factor so as.numeric(chromosome) yields the plotting position;
# sequence names not listed in chrom_order become NA.
sample_cns$chromosome <- factor(x = sample_cns$chromosome, levels = chrom_order)
# Karyotype-style overview: one bar per chromosome with the selected
# annotation intervals drawn on top of it.
ggplot(chrom_sizes2) +
  # Layer 1: chromosome outlines. Each bar is centred on the integer code of
  # the chromosome factor (+/- 0.2) and runs from 0 to `size`.
  geom_rect(
    mapping = aes(
      xmin = as.numeric(chromosome) - 0.2,
      xmax = as.numeric(chromosome) + 0.2,
      ymin = 0,
      ymax = size
    ),
    fill = "white",
    colour = "black"
  ) +
  # Layer 2: intervals from sample_cns, same bar width, spanning start..end.
  geom_rect(
    data = sample_cns,
    mapping = aes(
      xmin = as.numeric(chromosome) - 0.2,
      xmax = as.numeric(chromosome) + 0.2,
      ymin = start,
      ymax = end
    )
  ) +
  # Label the numeric bar positions with the chromosome names.
  scale_x_discrete(name = "chromosome", limits = names(chrom_key)) +
  # Rotate the plot 90 degrees.
  coord_flip() +
  # Strip grid lines and panel background.
  theme(
    axis.text.x = element_text(colour = "black"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    legend.position = "bottom"
  ) +
  labs(title = "gencode.GRCh37.p13.v19")