Yikang Test
2018, Oct 15
<!DOCTYPE html>
Globi database exploration:¶
In [2]:
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import pytaxize
Import interaction data:¶
In [3]:
# Location of the tab-separated interactions dump. Set the
# GLOBI_INTERACTIONS_TSV environment variable to point at your own copy;
# when it is unset, the original hard-coded local path is used.
interactions_path = os.environ.get('GLOBI_INTERACTIONS_TSV', '/Users/iamciera/Desktop/interactions.tsv')
data = pd.read_csv(interactions_path, sep='\t', encoding='utf-8')
In [5]:
# Preview the first five rows of the interaction table.
data.head(n=5)
Out[5]:
In [6]:
# Distinct interaction types present in the data.
data.loc[:, 'interactionTypeName'].unique()
Out[6]:
Drop duplicates:¶
In [7]:
# Keep a single record per (source taxon, interaction type, target taxon) triple.
data = data.drop_duplicates(subset=['sourceTaxonId', 'interactionTypeName', 'targetTaxonId'])
In [8]:
# Number of records left after de-duplication.
data.shape[0]
Out[8]:
Data Exploration:¶
Let's look at a specific taxon:¶
For example, suppose we are interested in the interactions involving 'Homo sapiens'.
In [9]:
# Interaction types recorded with Homo sapiens as the source taxon.
data.loc[data['sourceTaxonName'] == 'Homo sapiens', 'interactionTypeName'].unique()
Out[9]:
In [10]:
# How many records have Homo sapiens as the source taxon?
data.loc[data['sourceTaxonName'] == 'Homo sapiens'].shape[0]
Out[10]:
Let's focus on one type of interaction with 'Homo sapiens' as the sourceTaxon, for example 'eats':¶
In [11]:
# Records where Homo sapiens is the source taxon and the interaction is 'eats'.
hs_eats_data = data.loc[
    data['sourceTaxonName'].eq('Homo sapiens') & data['interactionTypeName'].eq('eats')
]
In [12]:
# Preview the first five 'Homo sapiens eats ...' records.
hs_eats_data.head(n=5)
Out[12]:
In [13]:
# Number of 'Homo sapiens eats ...' records.
hs_eats_data.shape[0]
Out[13]:
In [14]:
# Keep only the target-taxon columns, then drop every row that is missing
# any of the four core target identifiers (id, name, path names, path ids).
target_hs_eats = (
    hs_eats_data
    .loc[:, [
        'targetTaxonId',
        'targetTaxonName',
        'targetTaxonPathNames',
        'targetTaxonPathIds',
        'targetTaxonPathRankNames',
        'targetTaxonSpeciesName',
        'targetTaxonSpeciesId',
        'targetTaxonGenusName',
        'targetTaxonGenusId',
        'targetTaxonFamilyName',
        'targetTaxonFamilyId',
        'targetTaxonOrderName',
        'targetTaxonOrderId',
        'targetTaxonClassName',
        'targetTaxonClassId',
        'targetTaxonPhylumName',
        'targetTaxonPhylumId',
        'targetTaxonKingdomName',
        'targetTaxonKingdomId',
    ]]
    .dropna(subset=[
        'targetTaxonId',
        'targetTaxonName',
        'targetTaxonPathNames',
        'targetTaxonPathIds',
    ])
)
target_hs_eats.head(n=5)
Out[14]:
In [15]:
# Number of records that still have complete target identifiers.
target_hs_eats.shape[0]
Out[15]:
In [16]:
# Record count per target class, largest first.
(
    target_hs_eats
    .groupby('targetTaxonClassName')
    .size()
    .sort_values(ascending=False)
)
Out[16]:
So far, we have found the top target classes of 'Homo sapiens' for the interaction type 'eats'.
Similarly, we can get the top targets at any rank, for any source taxon and any interaction type, with the following function, 'find_top_target':
In [17]:
def find_top_target(source, interaction_type, rank, df=None):
    """Count interaction records per target taxon for one source taxon and interaction type.

    Args:
        source: the source taxon of interest, at any taxonomic level. It is
            matched exactly against the 'sourceTaxonName' column.
        interaction_type: the interaction type of interest, matched exactly
            against the 'interactionTypeName' column (use the names that
            appear in the tsv file).
        rank: name of the column to group the targets by, as spelled in the
            tsv file, such as 'targetTaxonFamilyName', 'targetTaxonOrderName',
            'targetTaxonClassName'...
        df: optional DataFrame of interactions with the same column names as
            the tsv file. Defaults to the notebook-level `data` table.

    Returns:
        A pandas Series indexed by the distinct values of `rank`, holding the
        number of matching records for each value, in descending order.
        Records missing any of 'targetTaxonId', 'targetTaxonName',
        'targetTaxonPathNames' or 'targetTaxonPathIds' are excluded. The
        Series is empty when nothing matches.

    Raises:
        KeyError: if `rank` or one of the columns used for filtering is not
            present in the interaction table.
    """
    if df is None:
        # Fall back to the interaction table loaded earlier in the notebook.
        df = data

    required = ['targetTaxonId', 'targetTaxonName', 'targetTaxonPathNames', 'targetTaxonPathIds']
    needed = ['sourceTaxonName', 'interactionTypeName', rank] + required
    missing = [col for col in needed if col not in df.columns]
    if missing:
        raise KeyError(f"column(s) not found in the interaction data: {missing}")

    # Build one combined mask so the table is filtered in a single pass.
    matches = (df['sourceTaxonName'] == source) & (df['interactionTypeName'] == interaction_type)
    cleaned = df.loc[matches].dropna(subset=required)
    return cleaned.groupby(rank).size().sort_values(ascending=False)
Examples:
In [18]:
# Top target classes for Homo sapiens with interaction type 'eats'.
find_top_target(source='Homo sapiens', interaction_type='eats', rank='targetTaxonClassName')
Out[18]:
In [19]:
# Top target families for Homo sapiens with interaction type 'hostOf'.
find_top_target(source='Homo sapiens', interaction_type='hostOf', rank='targetTaxonFamilyName')
Out[19]:
Instead of a source species, what if the source is a taxon at another level, such as a class or a family?¶
In [20]:
# Top target classes for Actinopterygii with interaction type 'preysOn'.
find_top_target(source='Actinopterygii', interaction_type='preysOn', rank='targetTaxonClassName')
Out[20]:
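Here, the source 'Actinopterygii' is itself at the Class level.
We can see that the top target Class that 'Actinopterygii' preys on is also 'Actinopterygii', which suggests that these records most often describe predation within the same Class. Note that 'find_top_target' matches the sourceTaxonName exactly, so only records whose source is recorded as 'Actinopterygii' itself are counted, not records for individual species that belong to that Class.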
Link our results to Wikipedia pages:¶
To learn more about the resulting taxa, we can link each of them to its Wikipedia page:
In [21]:
def make_clickable_both(val):
    """Turn a 'name#url' string into an HTML anchor that shows `name` and links to `url`.

    Args:
        val: string of the form '<name>#<url>'. Only the first '#' acts as the
            separator, so the URL part may itself contain a '#' fragment.

    Returns:
        The string '<a href="url">name</a>', with both parts HTML-escaped so
        characters such as '&', '<' or quotes cannot break the markup.

    Raises:
        ValueError: if `val` contains no '#' separator.
    """
    import html  # local import keeps this cell self-contained

    # maxsplit=1: everything after the first '#' belongs to the URL.
    name, url = val.split('#', 1)
    return f'<a href="{html.escape(url, quote=True)}">{html.escape(name)}</a>'
In [22]:
def top_targets_with_wiki(source, interaction_type, rank):
    """Find the top targets and show each one as a clickable link to its Wikipedia page.

    Args:
        source: the source taxon of interest, at any taxonomic level.
        interaction_type: the interaction type of interest; use the names
            that appear in the tsv file.
        rank: the level of target taxon of interest, given as a column name
            from the tsv file, such as 'targetTaxonFamilyName',
            'targetTaxonOrderName', 'targetTaxonClassName'...

    Returns:
        A pandas Styler over a one-column ('count') table whose row labels are
        HTML links to 'https://en.wikipedia.org/wiki/<target>', in descending
        order of number of records.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    top_targets = find_top_target(source, interaction_type, rank)
    target_df = top_targets.to_frame(name='count')

    links = []
    for target in target_df.index:
        label = str(target)  # tolerate non-string index values
        # Wikipedia titles use underscores for spaces. Percent-encoding the
        # rest keeps the link valid and guarantees the URL has no raw '#',
        # which make_clickable_both uses as the name/url separator.
        url = 'https://en.wikipedia.org/wiki/' + quote(label.replace(' ', '_'))
        links.append(make_clickable_both(label + '#' + url))
    target_df.index = links

    # The links live in the index, so no per-column formatter is needed; the
    # previous formatter was registered for a 'wiki' column that never exists.
    return target_df.style
Examples:
In [23]:
# Homo sapiens 'eats': top target classes, linked to Wikipedia.
top_targets_with_wiki(source='Homo sapiens', interaction_type='eats', rank='targetTaxonClassName')
Out[23]:
In [24]:
# Homo sapiens 'hostOf': top target families, linked to Wikipedia.
top_targets_with_wiki(source='Homo sapiens', interaction_type='hostOf', rank='targetTaxonFamilyName')
Out[24]:
In [25]:
# Actinopterygii 'preysOn': top target classes, linked to Wikipedia.
top_targets_with_wiki(source='Actinopterygii', interaction_type='preysOn', rank='targetTaxonClassName')
Out[25]:
Make directed graphs:¶
In [28]:
import networkx as nx
In [38]:
def plot(source, interaction_type, rank, n = None):
    """Draw a directed graph from the source taxon to its top targets.

    Args:
        source: the source taxon of interest, at any taxonomic level.
        interaction_type: the interaction type of interest; use the names
            that appear in the tsv file. It is also used as the edge label.
        rank: the level of target taxon of interest, given as a column name
            from the tsv file, such as 'targetTaxonFamilyName',
            'targetTaxonOrderName', 'targetTaxonClassName'...
        n: plot only the first n top targets; the default None plots all of
            them. n=0 plots the source node alone.

    Returns:
        None. The figure is displayed with plt.show(); each edge runs from
        `source` to one target and is labelled with `interaction_type`.
    """
    top_targets = find_top_target(source, interaction_type, rank)
    if n is not None:
        # Compare against None so that n=0 is honoured rather than meaning
        # "plot everything"; iloc makes the positional slice explicit.
        top_targets = top_targets.iloc[:n]
    targets = list(top_targets.index)

    G = nx.DiGraph()
    # Add the source explicitly so it is still drawn when there are no targets.
    G.add_node(source)
    for target in targets:
        G.add_edge(source, target, label = interaction_type)

    plt.figure(figsize=(8,8))
    edge_labels = nx.get_edge_attributes(G,'label')
    pos = nx.spring_layout(G)
    nx.draw_networkx_edge_labels(G,pos, edge_labels = edge_labels, font_size=15, font_color='orange')
    nx.draw_networkx(G, pos, with_labels=True, node_size=1500, node_color="skyblue", alpha= 1, arrows=True,
                     linewidths=1, font_color="grey", font_size=15, style = 'dashed')
    plt.axis('off')
    plt.tight_layout()
    plt.show()
In [34]:
# Homo sapiens 'eats': five most frequent target classes.
plot('Homo sapiens', 'eats', 'targetTaxonClassName', n=5)
In [35]:
# Homo sapiens 'eats': ten most frequent target classes.
plot('Homo sapiens', 'eats', 'targetTaxonClassName', n=10)
In [39]:
# Homo sapiens 'eats': all target classes (n left at its default).
plot(source='Homo sapiens', interaction_type='eats', rank='targetTaxonClassName')
In [36]:
# Homo sapiens 'hostOf': five most frequent target families.
plot('Homo sapiens', 'hostOf', 'targetTaxonFamilyName', n=5)
In [37]:
# Actinopterygii 'preysOn': five most frequent target classes.
plot('Actinopterygii', 'preysOn', 'targetTaxonClassName', n=5)
In [ ]: