kokocinski.net

Diese Seite drucken

Ensembl coding

The EnsEMBL project not only provides data on genome annotation, it also maintains a powerful interface
to work with this data programatically (API). The large set of modules written in Perl can for example be used to
  • fetch information on your favorite genomic region
  • get data on specific genes in different organisms
  • annotate the clones on your microarray
  • start your own gene-prediction project
There is an API in the language JAVA as well, this is not being developed further, though. An API in Ruby is being developed externally and aviabable here. Of special interest is also the BioMart system which allows complex cross-database queries in a simple interface.
More information can be found on the EnsEMBL pages. For specific coding questions subscribe to the mailing list.

Some simple code examples to get you started

1. connect to the database [\u2212]
#connect to defined database using the DBAdaptor
# (used in examples 1-8)

use Bio::EnsEMBL::DBSQL::DBAdaptor;
my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor(
-host => 'ensembldb.ensembl.org',
-dbname => 'homo_sapiens_core_42_36b',
-user => 'anonymous',
);

#OR connect automaticall using a the Registry
# (used in example 9 ff.)
#please see http://www.ensembl.org/info/software/registry/index.html use Bio::EnsEMBL::Registry; Bio::EnsEMBL::Registry->load_registry_from_db( -host => 'ensembldb.ensembl.org', -user => 'anonymous' );

2. fetch a specific chromosome [\u2212]
# [connect (1)]

my $chrom = "X";
my $slice_adaptor = $db->get_SliceAdaptor;
my $slice = $slice_adaptor->fetch_by_region("chromosome", $chrom);
print "\nhave slice of ".$slice->seq_region_name." ".
$slice->seq_region_start."-".$slice->seq_region_end;

3. fetch a specific genomic region [\u2212]
# [connect (1)]

my $chrom = "X";
my $start = 100000;
my $end = 200000;
my $strand = 1;
my $slice_adaptor = $db->get_SliceAdaptor;
my $slice = $slice_adaptor->fetch_by_region(
"chromosome",
$chrom,
$start,
$end,
$strand);
print "\nhave slice of ".$slice->seq_region_name." ".
$slice->seq_region_start."-".$slice->seq_region_end;

4. fetch all chromosomes [\u2212]
# [connect (1)]

my @chromosomes;
foreach my $chr ( @{ $slice_adaptor->fetch_all('chromosome') } ) {
#print out information
print $chr->seq_region_name.", ".$chr->start." - ".$chr->end."\n";
#store the names
push @chromosomes, $chr->seq_region_name;
#or work with the chromosome...
}

5. fetch genes [\u2212]
# [connect (1)]

#all genes from a slice, e.g. a chromosome
# [get_slice]
my @genes = @{ $slice->get_all_Genes() };

#specific gene, using EnsEMBL-ID
my $gene_adaptor = $db->get_GeneAdaptor;
my $gene = $gene_adaptor->fetch_by_stable_id("ENSG00000147892");

#specific gene, using gene symbol (short name)
my $gene_adaptor = $db->get_GeneAdaptor;
my @genes = @{$gene_adaptor->fetch_all_by_external_name("ADAMTSL1")};

6. get more information on the gene [\u2212]
# [connect (1)]

# [get_gene (5)]

#print basic information
print "genomic localization:\t".$gene->seq_region_name.
"\t".$gene->start."\t".$gene->end.
"\t".$gene->strand."\t".$gene->biotype."\n";
print "description:\t".$gene->description."\n";
#print further infos, ids
foreach my $dbEntry ( @{ $gene->get_all_DBEntries } ) {
if( $dbEntry->database eq "HUGO" or
$dbEntry->database eq "HGNC"){
$symbol = "symbol:\t".$dbEntry->display_id."\n";
}
else{
print "ID:\t".$dbEntry->database.":\t".
$dbEntry->display_id."\n";
}
}

7. work with the gene [\u2212]
# [connect (1)]

# [get_gene (5)]

foreach my $transcript ( @{ $gene->get_all_Transcripts } ) {
print $transcript->dbID."\t".$transcript->start."\t".
$transcript->end."\t".$transcript->strand."\n";
foreach my $exon ( @{ $transcript->get_all_Exons } ) {
print "\t".$exon->dbID."\t".$exon->start."\t".
$exon->end."\n";
}
}

8. use SQL to fetch information [\u2212]
# [connect (1)]

my $query = "SELECT gene_id, seq_region_id, seq_region_start,
seq_region_end, seq_region_strand FROM gene LIMIT 5;";
my $sth = $db->dbc->prepare($query);
$sth->execute();
while(my ($id, $region, $start, $end, $strand) = $sth->fetchrow) {
print "gene $id: $region, $start, $end, $strand\n";
}

9. get homologues genes using EnsEMBL::Compara [\u2212]
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Compara::DBSQL::DBAdaptor;

#use Registry file for a simple connection setup,
#please see http://www.ensembl.org/info/software/registry/index.html use Bio::EnsEMBL::Registry; Bio::EnsEMBL::Registry->load_registry_from_db( -host => 'ensembldb.ensembl.org', -user => 'anonymous' ); #get compara adaptors my $ma = Bio::EnsEMBL::Registry->get_adaptor( 'compara', 'compara', 'Member') or die "\n$@\ncan't get adaptor 1.\n"; my $ha = Bio::EnsEMBL::Registry->get_adaptor( 'compara', 'compara', 'Homology') or die "\n$@\ncan't get adaptor 2.\n"; #fetch human gene from core database my $query_species = "Homo_sapiens"; my $gene_id = "ENSG00000147892"; #fetch source gene my $member = $ma->fetch_by_source_stable_id( "ENSEMBLGENE", $gene_id) or return 0; my $sourceGenome = $member->genome_db->dbID; print "\nsource gene ($query_species): ".$member->stable_id; #get all homologues from other species my $other_species = "Mus_musculus"; my $homologies = $ha->fetch_by_Member_paired_species( $member, $other_species); #or from all species #my $homologies = $ha->fetch_by_Member($member); #display all results foreach my $homologie (@$homologies) { foreach my $member_attrib (@{$homologie->get_all_Member_Attribute}) { my ($newmember, $attrib) = @$member_attrib; if ($newmember->genome_db->dbID != $sourceGenome) { print "\nhomologue: ".$newmember->stable_id. " / ".$newmember->taxon_id. ": ".$newmember->chr_name. " ".$newmember->chr_start. "-".$newmember->chr_end; } } }

10. get GeneOntology term for a gene using EnsEMBL & GOApph [\u2212]
use Bio::EnsEMBL::DBSQL::DBAdaptor;
# [connect (1)]

#use GO::AppHandle for GO logic if possible!
use GO::AppHandle;
my %args = (
-dbhost => 'sin.lbl.gov',
-dbname => 'go',
);
my $apph = GO::AppHandle->connect( \%args );

# [get_gene (5)]

#get GO infos
if ( $gene->is_known ) {
foreach $link ( @{ $gene->get_all_DBLinks } ) {
if ( $link->database eq "GO" ) {

#show GO term
print $link->display_id;

#get the ancester terms
foreach my $go (@GOs1) {
get_parent($go);
}

}
}
}

#fetch all parent terms recursively
sub get_parent($) {
my $term = shift;
my $parent_term;
my $type;
my $parent_terms;

$parent_terms = $apph->get_parent_terms($term);
foreach $parent_term (@$parent_terms) {
get_parent($parent_term);
}
$parent_term = $term->name();
if ( ( $parent_term ne "Gene_Ontology" )
&& ( $parent_term ne "molecular_function" )
&& ( $parent_term ne "cellular_component" )
&& ( $parent_term ne "biological_process" ) ) {
print $parent_term."(".$term->type "), ";
}
}

11. get GeneOntology term for a gene using EnsEMBL only [\u2212]
use Bio::EnsEMBL::DBSQL::DBAdaptor;

# connect with general registry
my $registry = 'Bio::EnsEMBL::Registry';
$registry->load_registry_from_db(
                 '-host'       => 'ensembldb.ensembl.org',
                 '-user'       => 'anonymous',
                 '-db_version' => '58',
                 '-verbose'    => '0',
                );
# get adaptors
my $goa  = $registry->get_adaptor( 'Multi', 'Ontology', 'GOTerm' ) or die "Cant get GO adaptor\n";
my $ga   = $registry->get_adaptor( 'Human', 'Core', 'Gene' );

my $id = "ENSG00000006062";
my $gene = $ga->fetch_by_stable_id($id);

#get GO infos
foreach my $link ( @{ $gene->get_all_DBLinks } ) {
  if ( $link->database eq "GO" ) {
    my $term_id = $link->display_id;
    my $term_name = '-';
    my $term = $goa->fetch_by_accession($term_id);
    if($term and $term->name){
      $term_name = $term->name;
    }
    print $gene->stable_id.": $term_id ($term_name)\n";

    #fetch complete GO hierachy
    foreach my $ancestor_term (@{ $term->ancestors() }){
      print "\t".$ancestor_term->accession." (".$ancestor_term->name.")\n";
    }

  }
}