6 Vraag 6: Hox genes
6.1 A. Verkrijg de sequentie van het HOXA1-gen van de mens, de bonobo en de gorilla met behulp van BioMart
# The human mart from question 1 is still available as `h_sapiens`.
# Look up the Ensembl dataset names for bonobo and gorilla.
searchDatasets(h_sapiens, pattern = "(B|b)onobo")
## dataset description version
## 154 ppaniscus_gene_ensembl Bonobo genes (panpan1.1) panpan1.1
searchDatasets(h_sapiens, pattern = "(G|g)orilla")
## dataset description version
## 74 ggorilla_gene_ensembl Gorilla genes (gorGor4) gorGor4
# Connect to each species' dataset. Each object name must match the dataset it
# points at, because the sequence retrieval further down passes these marts by
# name (mart = g_gorilla / mart = p_paniscus).
p_paniscus <- useMart(biomart = "ensembl", dataset = "ppaniscus_gene_ensembl")
g_gorilla <- useMart(biomart = "ensembl", dataset = "ggorilla_gene_ensembl")
# Search the human mart for a filter that selects genes by name.
searchFilters(mart = h_sapiens, pattern = "(G|g)ene.*name")
## name description
## 18 with_entrezgene_trans_name With EntrezGene transcript name ID(s)
## 64 external_gene_name Gene Name(s) [e.g. MT-TF]
## 76 entrezgene_trans_name EntrezGene transcript name ID(s) [e.g. AA06-201]
## 119 wikigene_name WikiGene name(s) [e.g. A1BG]
# List the human-mart attributes whose names match the bonobo homolog pattern.
searchAttributes(mart = h_sapiens, pattern = "ppaniscus.*homolog")
## name
## 626 ppaniscus_homolog_ensembl_gene
## 627 ppaniscus_homolog_associated_gene_name
## 628 ppaniscus_homolog_ensembl_peptide
## 629 ppaniscus_homolog_chromosome
## 630 ppaniscus_homolog_chrom_start
## 631 ppaniscus_homolog_chrom_end
## 632 ppaniscus_homolog_canonical_transcript_protein
## 633 ppaniscus_homolog_subtype
## 634 ppaniscus_homolog_orthology_type
## 635 ppaniscus_homolog_perc_id
## 636 ppaniscus_homolog_perc_id_r1
## 637 ppaniscus_homolog_goc_score
## 638 ppaniscus_homolog_wga_coverage
## 639 ppaniscus_homolog_orthology_confidence
## description page
## 626 Bonobo gene stable ID homologs
## 627 Bonobo gene name homologs
## 628 Bonobo protein or transcript stable ID homologs
## 629 Bonobo chromosome/scaffold name homologs
## 630 Bonobo chromosome/scaffold start (bp) homologs
## 631 Bonobo chromosome/scaffold end (bp) homologs
## 632 Query protein or transcript ID homologs
## 633 Last common ancestor with Bonobo homologs
## 634 Bonobo homology type homologs
## 635 %id. target Bonobo gene identical to query gene homologs
## 636 %id. query gene identical to target Bonobo gene homologs
## 637 Bonobo Gene-order conservation score homologs
## 638 Bonobo Whole-genome alignment coverage homologs
## 639 Bonobo orthology confidence [0 low, 1 high] homologs
# List the human-mart attributes whose names match the gorilla homolog pattern.
searchAttributes(mart = h_sapiens, pattern = "ggorilla.*homolog")
## name
## 1428 ggorilla_homolog_ensembl_gene
## 1429 ggorilla_homolog_associated_gene_name
## 1430 ggorilla_homolog_ensembl_peptide
## 1431 ggorilla_homolog_chromosome
## 1432 ggorilla_homolog_chrom_start
## 1433 ggorilla_homolog_chrom_end
## 1434 ggorilla_homolog_canonical_transcript_protein
## 1435 ggorilla_homolog_subtype
## 1436 ggorilla_homolog_orthology_type
## 1437 ggorilla_homolog_perc_id
## 1438 ggorilla_homolog_perc_id_r1
## 1439 ggorilla_homolog_goc_score
## 1440 ggorilla_homolog_wga_coverage
## 1441 ggorilla_homolog_orthology_confidence
## description page
## 1428 Gorilla gene stable ID homologs
## 1429 Gorilla gene name homologs
## 1430 Gorilla protein or transcript stable ID homologs
## 1431 Gorilla chromosome/scaffold name homologs
## 1432 Gorilla chromosome/scaffold start (bp) homologs
## 1433 Gorilla chromosome/scaffold end (bp) homologs
## 1434 Query protein or transcript ID homologs
## 1435 Last common ancestor with Gorilla homologs
## 1436 Gorilla homology type homologs
## 1437 %id. target Gorilla gene identical to query gene homologs
## 1438 %id. query gene identical to target Gorilla gene homologs
## 1439 Gorilla Gene-order conservation score homologs
## 1440 Gorilla Whole-genome alignment coverage homologs
## 1441 Gorilla orthology confidence [0 low, 1 high] homologs
# Attributes: ensembl_gene_id, ggorilla_homolog_ensembl_gene,
#             ppaniscus_homolog_ensembl_gene
# Filter:     external_gene_name
# Query the human mart once for HOXA1: this returns the human gene ID together
# with the gene IDs of its gorilla and bonobo homologs.
gene_id <- getBM(
  attributes = c(
    "ensembl_gene_id",
    "ggorilla_homolog_ensembl_gene",
    "ppaniscus_homolog_ensembl_gene"
  ),
  filters = "external_gene_name",
  values = "HOXA1",
  mart = h_sapiens
)
gene_id %>% kable()
ensembl_gene_id | ggorilla_homolog_ensembl_gene | ppaniscus_homolog_ensembl_gene |
---|---|---|
ENSG00000105991 | ENSGGOG00000002209 | ENSPPAG00000000155 |
# Retrieve the coding sequences of HOXA1 for each species. Each call uses the
# gene ID found by the homolog query and the mart of the matching species.
h_sapiens_HOXA1 <- biomaRt::getSequence(
  id = "ENSG00000105991",
  type = "ensembl_gene_id",
  seqType = "coding",
  mart = h_sapiens
)
g_gorilla_HOXA1 <- biomaRt::getSequence(
  id = "ENSGGOG00000002209",
  type = "ensembl_gene_id",
  seqType = "coding",
  mart = g_gorilla
)
p_paniscus_HOXA1 <- biomaRt::getSequence(
  id = "ENSPPAG00000000155",
  type = "ensembl_gene_id",
  seqType = "coding",
  mart = p_paniscus
)
# Check every result table for repeated coding sequences: print the table name
# followed by one duplicated() flag per retrieved sequence.
for (i in c("h_sapiens_HOXA1", "g_gorilla_HOXA1", "p_paniscus_HOXA1")) {
  is_dup <- duplicated(as.vector(get(i)[["coding"]]))
  print(c(i, is_dup))
}
## [1] "h_sapiens_HOXA1" "FALSE" "FALSE"
## [1] "g_gorilla_HOXA1" "FALSE" "FALSE"
## [1] "p_paniscus_HOXA1" "FALSE" "FALSE"
The sequences have been retrieved from the Ensembl database; for each Ensembl gene ID there are two distinct coding sequences.
# Stack the per-species sequence tables into one table.
table_coding <- bind_rows(h_sapiens_HOXA1, g_gorilla_HOXA1, p_paniscus_HOXA1)
# Label every row with its species. The repeat counts are taken from the number
# of rows actually retrieved per species, so the labels stay aligned even when
# a species does not return exactly two sequences.
species_labels <- rep(
  c("h_sapiens", "g_gorilla", "p_paniscus"),
  times = c(
    nrow(h_sapiens_HOXA1),
    nrow(g_gorilla_HOXA1),
    nrow(p_paniscus_HOXA1)
  )
)
table_coding <- bind_cols(table_coding, tibble(species = species_labels))
# Render the table; the sequence column is limited to 10cm so it wraps.
table_coding %>%
  knitr::kable() %>%
  column_spec(column = 1, width = "10cm")
coding | ensembl_gene_id | species |
---|---|---|
ATGGACAATGCAAGAATGAACTCCTTCCTGGAATACCCCATACTTAGCAGTGGCGACTCGGGGACCTGCTCAGCCCGAGCCTACCCCTCGGACCATAGGATTACAACTTTCCAGTCGTGCGCGGTCAGCGCCAACAGTTGCGGCGGCGACGACCGCTTCCTAGTGGGCAGGGGGGTGCAGATCGGTTCGCCCCACCACCACCACCACCACCACCATCGCCACCCCCAGCCGGCTACCTACCAGACTTCCGGGAACCTGGGGGTGTCCTACTCCCACTCAAGTTGTGGTCCAAGCTATGGCTCACAGAACTTCAGTGCGCCTTACAGCCCCTACGCGTTAAATCAGGAAGCAGACCCACCAAGAAGCCTGTCGCTCCCCCGCATCGGAGACATCTTCTCCAGCGCAGACTTTTGA | ENSG00000105991 | h_sapiens |
ATGGACAATGCAAGAATGAACTCCTTCCTGGAATACCCCATACTTAGCAGTGGCGACTCGGGGACCTGCTCAGCCCGAGCCTACCCCTCGGACCATAGGATTACAACTTTCCAGTCGTGCGCGGTCAGCGCCAACAGTTGCGGCGGCGACGACCGCTTCCTAGTGGGCAGGGGGGTGCAGATCGGTTCGCCCCACCACCACCACCACCACCACCATCGCCACCCCCAGCCGGCTACCTACCAGACTTCCGGGAACCTGGGGGTGTCCTACTCCCACTCAAGTTGTGGTCCAAGCTATGGCTCACAGAACTTCAGTGCGCCTTACAGCCCCTACGCGTTAAATCAGGAAGCAGACGTAAGTGGTGGGTACCCCCAGTGCGCTCCCGCTGTTTACTCTGGAAATCTCTCATCTCCCATGGTCCAGCATCACCACCACCACCAGGGTTATGCTGGGGGCGCGGTGGGCTCGCCTCAATACATTCACCACTCATATGGACAGGAGCACCAGAGCCTGGCCCTGGCTACGTATAATAACTCCTTGTCCCCTCTCCACGCCAGCCACCAAGAAGCCTGTCGCTCCCCCGCATCGGAGACATCTTCTCCAGCGCAGACTTTTGACTGGATGAAAGTCAAAAGAAACCCTCCCAAAACAGGGAAAGTTGGAGAGTACGGCTACCTGGGTCAACCCAACGCGGTGCGCACCAACTTCACTACCAAGCAGCTCACGGAACTGGAGAAGGAGTTCCACTTCAACAAGTACCTGACGCGCGCCCGCAGGGTGGAGATCGCTGCATCCCTGCAGCTCAACGAGACCCAAGTGAAGATCTGGTTCCAGAACCGCCGAATGAAGCAAAAGAAACGTGAGAAGGAGGGTCTCTTGCCCATCTCTCCGGCCACCCCGCCAGGAAACGACGAGAAGGCCGAGGAATCCTCAGAGAAGTCCAGCTCTTCGCCCTGCGTTCCTTCCCCGGGGTCTTCTACCTCAGACACTCTGACTACCTCCCACTGA | ENSG00000105991 | h_sapiens |
ATGGACAATGCAAGAATGAACTCCTTCCTGGAATACCCTATACTTAGCAGTGGCGACTCGGGGACCTGCTCAGCCCGAGCCTACCCCTCGGACCATGGGATTACAACTTTCCAGTCGTGCGCGGTCAGTGCCAACAGTTGCGGCGGCGACGACCGCTTCCTAGTGGGCAGGGGGGTGCAGATTGGTTCGCCCCACCAGCACCACCACCACCACCATCACCACCCCCAGCCGGCTACCTACCAGACTTCCGGGAACCTGGGGGTGTCCTACTCCCACTCGAGTTGTGGTCCAAGCTATGGCTCACAGAACTTCAGTGCGCCTTACAGCCCCTACGCGTTAAATCAGGAAGCAGACGTAAGTGGTGGGTACCCCCAGTGCGCTCCCGCTGTTTACTCTGGAAATCTCTCATCTCCCATGGTCCAGCATCACCACCACCACCAGGGTTATGCTGGGGGCGCGGTGGGCTCGCCTCAATACATTCACCACTCATATGGACAGGAGCACCAGAGCCTGGCCCTGGCTACGTATAATAACTCCTTGTCCCCTCTCCACGCCAGCCACCAAGAAGCCTGTCGCTCCCCTGCATCGGAGACATCTTCTCCAGCGCAGACTTTTGACTGGATGAAAGTCAAAAGAAACCCTCCCAAAACAGGGAAAGTTGGAGAGTACGGCTACCTGGGTCAACCCAACGCGGTGCGTACCAACTTCACTACCAAGCAGCTCACGGAACTGGAGAAGGAGTTCCACTTCAACAAGTACCTGACGCGCGCCCGCAGAGTGGAGATCGCTGCATCCCTGCAGCTCAACGAGACCCAAGTGAAGATCTGGTTCCAGAACCGCCGAATGAAGCAAAAGAAACGTGAGAAGGAGGGTCTCTTGCCCATCTCTCCGGCCACCCCGCCAGGAAACGACGAGAAGGCCGAGGAATCCTCAGAGAAGTCCAGCTCTTCGCCCTGCGTTCCTTCCCCGGGGTCTTCTACCTCAGACACTCTGACTACCTCCCACTGA | ENSGGOG00000002209 | g_gorilla |
ATGGACAATGCAAGAATGAACTCCTTCCTGGAATACCCTATACTTAGCAGTGGCGACTCGGGGACCTGCTCAGCCCGAGCCTACCCCTCGGACCATGGGATTACAACTTTCCAGTCGTGCGCGGTCAGTGCCAACAGTTGCGGCGGCGACGACCGCTTCCTAGTGGGCAGGGGGGTGCAGATTGGTTCGCCCCACCAGCACCACCACCACCACCATCACCACCCCCAGCCGGCTACCTACCAGACTTCCGGGAACCTGGGGGTGTCCTACTCCCACTCGAGTTGTGGTCCAAGCTATGGCTCACAGAACTTCAGTGCGCCTTACAGCCCCTACGCGTTAAATCAGGAAGCAGACCCACCAAGAAGCCTGTCGCTCCCCTGCATCGGAGACATCTTCTCCAGCGCAGACTTTTGA | ENSGGOG00000002209 | g_gorilla |
ATGGACAATGCAAGAATGAACTCCTTCCTGGAATACCCCATACTTAGCAGTGGCGACTCGGGGACCTGCCCAGCCCGAGCCTACCCCTCGGACCATGGGATTACAACTTTCCAGTCGTGCGCGGTCAGTGCCAACAGTTGCGGCGGCGACGACCGCTTCCTAATGGGCAGGGGGGTGCAGATCGGTTCGCCCCACCACCACCACCACCACCACCACCATCACCACCCCCAGCCGGCTACCTACCAGACTTCCGGGAACCTGGGGGTGTCCTACTCCCACTCGAGTTGTGGTCCAAGCTATGGCTCACAGAACTTCAGTGCGCCTTACAGCCCCTACGCGTTAAATCAGGAAGCAGACGTAAGTGGTGGGTACCCCCAGTGCGCTCCCGCTGTTTACTCTGGAAATCTCTCATCTCCCATGGTCCAGCATCACCACCACCACCAGGGTTATGCTGGGGGCGCGGTGGGCTCGCCTCAATACATTCACCACTCATATGGACAGGAGCACCAGAGCCTGGCCCTGGCTACGTATAATAACTCCTTGTCCCCTCTCCACGCCAGCCACCAAGAAGCCTGTCGCTCCCCTGCATCGGAGACATCTTCTCCAGCGCAGACTTTTGACTGGATGAAAGTCAAAAGAAACCCTCCCAAAACAGGGAAAGTTGGAGAGTACGGCTACCTGGGTCAACCCAACGCGGTGCGCACCAACTTCACTACCAAGCAGCTCACGGAACTGGAGAAGGAGTTCCACTTCAACAAGTACCTGACGCGCGCCCGCAGGGTGGAGATCGCTGCATCCCTGCAGCTCAACGAGACCCAAGTGAAGATCTGGTTCCAGAACCGCCGAATGAAGCAAAAGAAACGTGAGAAGGAGGGTCTCTTGCCCATCTCTCCGGCCACCCCGCCAGGAAACGACGAGAAGGCCGAGGAATCCTCAGAGAAGTCCAGCTCTTCGCCCTGCGTTCCTTCCCCGGGGTCTTCTACCTCAGACACTCTGACTACCTCCCACTGA | ENSPPAG00000000155 | p_paniscus |
ATGGACAATGCAAGAATGAACTCCTTCCTGGAATACCCCATACTTAGCAGTGGCGACTCGGGGACCTGCCCAGCCCGAGCCTACCCCTCGGACCATGGGATTACAACTTTCCAGTCGTGCGCGGTCAGTGCCAACAGTTGCGGCGGCGACGACCGCTTCCTAATGGGCAGGGGGGTGCAGATCGGTTCGCCCCACCACCACCACCACCACCACCACCATCACCACCCCCAGCCGGCTACCTACCAGACTTCCGGGAACCTGGGGGTGTCCTACTCCCACTCGAGTTGTGGTCCAAGCTATGGCTCACAGAACTTCAGTGCGCCTTACAGCCCCTACGCGTTAAATCAGGAAGCAGACCCACCAAGAAGCCTGTCGCTCCCCTGCATCGGAGACATCTTCTCCAGCGCAGACTTTTGA | ENSPPAG00000000155 | p_paniscus |
6.2 B. Voer een sequence alignment uit op de eiwitsequenties van deze genen
# Wrap the coding sequences in a DNAStringSet and name each one
# "<species>_<ensembl gene id>".
# NOTE: these names are not unique when a gene has more than one coding
# sequence (two per species in the table above); the clustering step later in
# this section warns about the duplicated labels.
seq <- Biostrings::DNAStringSet(table_coding$coding)
names(seq) <- paste(
  table_coding$species,
  table_coding$ensembl_gene_id,
  sep = "_"
)
# Align the sequences and request the result as an AAStringSet (amino acids).
AA <- AlignTranslation(seq, type = "AAStringSet")
## Determining distance matrix based on shared 5-mers:
##
|
| | 0%
|
|======================= | 33%
|
|=========================================== | 60%
|
|========================================================= | 80%
|
|================================================================== | 93%
|
|=======================================================================| 100%
##
## Time difference of 0 secs
##
## Clustering into groups by similarity:
##
|
| | 0%
|
|======================= | 33%
|
|=========================================== | 60%
|
|========================================================= | 80%
|
|================================================================== | 93%
|
|=======================================================================| 100%
##
## Time difference of 0 secs
##
## Aligning Sequences:
##
|
| | 0%
|
|============== | 20%
|
|============================ | 40%
|
|=========================================== | 60%
|
|========================================================= | 80%
|
|=======================================================================| 100%
##
## Time difference of 0.03 secs
##
## Iteration 1 of 2:
##
## Determining distance matrix based on alignment:
##
|
| | 0%
|
|======================= | 33%
|
|=========================================== | 60%
|
|========================================================= | 80%
|
|================================================================== | 93%
|
|=======================================================================| 100%
##
## Time difference of 0 secs
##
## Reclustering into groups by similarity:
##
|
| | 0%
|
|======================= | 33%
|
|=========================================== | 60%
|
|========================================================= | 80%
|
|================================================================== | 93%
|
|=======================================================================| 100%
##
## Time difference of 0 secs
##
## Realigning Sequences:
##
|
| | 0%
|
|=======================================================================| 100%
##
## Time difference of 0 secs
##
## Alignment converged - skipping remaining iteration.
# Display the aligned amino-acid sequences.
BrowseSeqs(myXStringSet = AA)

Sequence alignment
6.3 C. Maak een dendrogram van de eiwitsequenties
# Stagger the amino-acid alignment; the staggered alignment is the input for
# the distance matrix computed next.
AA_stag <- StaggerAlignment(AA)
## Calculating distance matrix:
##
|
| | 0%
|
|======================= | 33%
|
|=========================================== | 60%
|
|========================================================= | 80%
|
|================================================================== | 93%
|
|=======================================================================| 100%
##
## Time difference of 0 secs
##
## Constructing neighbor-joining tree:
##
|
| | 0%
|
|======================= | 33%
|
|=========================================== | 60%
|
|========================================================= | 80%
|
|================================================================== | 93%
|
|=======================================================================| 100%
##
## Time difference of 0 secs
##
## Staggering insertions and deletions:
##
|
| | 0%
|
|===================== | 30%
|
|========================== | 36%
|
|============================================= | 64%
|
|========================================================= | 80%
|
|========================================================== | 81%
|
|=======================================================================| 100%
##
## Time difference of 0.01 secs
# Compute the pairwise distance matrix of the staggered alignment.
AA_dis <- DistanceMatrix(AA_stag)
##
|
| | 0%
|
|======================= | 33%
|
|=========================================== | 60%
|
|========================================================= | 80%
|
|================================================================== | 93%
|
|=======================================================================| 100%
##
## Time difference of 0 secs
# Cluster the sequences from the distance matrix. With type = "both" the result
# is indexed with [[2]] when plotted below, so the dendrogram is presumably the
# second element (confirm against the DECIPHER documentation).
dendogram <- IdClusters(AA_dis, type = "both")
##
|
| | 0%
|
|======================= | 33%
|
|=========================================== | 60%
|
|========================================================= | 80%
|
|================================================================== | 93%
|
|=======================================================================| 100%
## Warning in IdClusters(AA_dis, type = "both"): Duplicated labels in dendrogram
## appended with index.
## Warning in IdClusters(AA_dis, type = "both"): Duplicated labels in myDistMatrix
## appended with index.
##
## Time difference of 0 secs
# Draw the second element of the clustering result.
plot(x = dendogram[[2]])