Effect of deferoxamine on the transcriptome of BSF trypanosomes

Michele Tinti & Calvin Tiengwe

set up notebook

In [5]:
#reload when modified
%load_ext autoreload
%autoreload 2
#activate r magic
%load_ext rpy2.ipython
%matplotlib inline
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
In [7]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utilities as UT
import missingno as msno
import random
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import gc

random.seed(1976)
np.random.seed(1976)

Data Anaylsis

Experiment SetUp

In [8]:
from IPython.display import Image
In [ ]:
 
In [9]:
#create a dictionary of gene to desc
#from the gff file
def make_desc(_GFF):
    gff =pd.read_csv( _GFF, sep='\t', header=None, comment='#')
    
    gff = gff[gff.iloc[:,2]=='gene']
    #print( gff[gff[gff.columns[-1]].str.contains('Tb427_020006200')] )
    desc = {}
    for n in gff.iloc[:,-1]:
        n=n.replace('%2C',' ')
        item_list = n.split(';')
        #print (item_list)
        temp_dict = {}
        for m in item_list:
            #print(m)
            temp_dict[m.split('=')[0].strip()]=m.split('=')[1].strip()
        #print(temp_dict['ID'])
        #print(temp_dict['description'])
        desc[temp_dict['ID']]=temp_dict.get('description','none')
    return desc

desc_dict = make_desc('genomes/tb927_3/tb927_3.gff')
desc_dict['Tb10.v4.0073']
Out[9]:
'variant surface glycoprotein (VSG  pseudogene)  putative'

Load Read Counts

In [13]:
exp = '{life_stage}{replica}'
list_df = [exp.format(
    life_stage=life_stage,
    replica=replica) 
 for life_stage in ['C','D','DF','F']
 for replica in ['1','2','3']
            ]
list_df = [n+'/res/'+n+'/counts.txt' for n in list_df]
list_df =[pd.read_csv(n,index_col=[0],comment='#',sep='\t') for n in list_df]
df = list_df[0].copy()
for temp_df in list_df[1:]:
    df = df.join(temp_df.iloc[:,-1])
df.head()
#temp_df = pd.read_csv('BSF/tb927_3_ks_counts_final.txt',index_col=[0],comment='#',sep='\t')
Out[13]:
Chr Start End Strand Length /tmp/2992.1.all.q/C1/C1_sorted.bam /tmp/2993.1.all.q/C2/C2_sorted.bam /tmp/2994.1.all.q/C3/C3_sorted.bam /tmp/2995.1.all.q/D1/D1_sorted.bam /tmp/2996.1.all.q/D2/D2_sorted.bam /tmp/2997.1.all.q/D3/D3_sorted.bam /tmp/2998.1.all.q/DF1/DF1_sorted.bam /tmp/2999.1.all.q/DF2/DF2_sorted.bam /tmp/3000.1.all.q/DF3/DF3_sorted.bam /tmp/3001.1.all.q/F1/F1_sorted.bam /tmp/3002.1.all.q/F2/F2_sorted.bam /tmp/3003.1.all.q/F3/F3_sorted.bam
Geneid
Tb10.v4.0073 tryp_X-188b09.p2kB601 929 1489 + 561 0 0 0 1 1 0 0 2 0 0 0 2
Tb10.v4.0074 tryp_X-188b09.p2kB601 2775 3452 + 678 0 0 0 2 2 4 0 0 0 0 0 1
Tb10.v4.0075 tryp_X-188b09.p2kB601 3781 5223 + 1443 0 0 1 0 2 0 0 0 0 0 0 0
Tb10.v4.0076 tryp_X-188b09.p2kB601 6264 7721 + 1458 0 0 0 0 0 0 0 0 0 0 0 0
Tb10.v4.0077 tryp_X-188b09.p2kB601 9669 10955 + 1287 0 0 0 0 0 0 0 0 0 0 0 0
In [14]:
data_col = df.columns[5:]
data_col
Out[14]:
Index(['/tmp/2992.1.all.q/C1/C1_sorted.bam',
       '/tmp/2993.1.all.q/C2/C2_sorted.bam',
       '/tmp/2994.1.all.q/C3/C3_sorted.bam',
       '/tmp/2995.1.all.q/D1/D1_sorted.bam',
       '/tmp/2996.1.all.q/D2/D2_sorted.bam',
       '/tmp/2997.1.all.q/D3/D3_sorted.bam',
       '/tmp/2998.1.all.q/DF1/DF1_sorted.bam',
       '/tmp/2999.1.all.q/DF2/DF2_sorted.bam',
       '/tmp/3000.1.all.q/DF3/DF3_sorted.bam',
       '/tmp/3001.1.all.q/F1/F1_sorted.bam',
       '/tmp/3002.1.all.q/F2/F2_sorted.bam',
       '/tmp/3003.1.all.q/F3/F3_sorted.bam'],
      dtype='object')
In [15]:
indata = df[data_col]
indata.columns = [n.split('/')[3] for  n in indata.columns]
indata.head()
Out[15]:
C1 C2 C3 D1 D2 D3 DF1 DF2 DF3 F1 F2 F3
Geneid
Tb10.v4.0073 0 0 0 1 1 0 0 2 0 0 0 2
Tb10.v4.0074 0 0 0 2 2 4 0 0 0 0 0 1
Tb10.v4.0075 0 0 1 0 2 0 0 0 0 0 0 0
Tb10.v4.0076 0 0 0 0 0 0 0 0 0 0 0 0
Tb10.v4.0077 0 0 0 0 0 0 0 0 0 0 0 0

labels

  • C - control
  • D - cells incubated with iron chelator deferoxamine
  • DF - iron-presaturated deferoxamine
  • F - FeCl3
In [16]:
print(indata.shape)
indata=indata.dropna()
print(indata.shape)
#indata.loc['KS17gene_1749a']
#indata['desc']=[desc_dict.get(n,'none') for n in indata.index.values]
#indata.to_csv('indata.csv')
#indata.head()
#indata.loc['mainVSG-427-2']
(19949, 12)
(19949, 12)

Missing Data Viz

In [ ]:
 
In [17]:
msno.matrix(indata.replace(0,np.nan).dropna(how='all'),figsize=(6, 4))
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b9910125cf8>
In [18]:
msno.bar(indata.replace(0,np.nan).dropna(how='all'),figsize=(6, 4))
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x2b98f433f0f0>

QC - Corr analysis

In [19]:
!mkdir -p Figures
In [20]:
fig,ax=plt.subplots(figsize=(6,6))
cbar_ax = fig.add_axes([.91, .6, .03, .2])
sns.heatmap(np.log2(indata).corr(),
            #vmin=-1,
            cmap='coolwarm',
            annot=True,linewidths=.5,ax=ax, cbar_ax = cbar_ax, cbar=True)
print(ax.get_ylim())
ax.set_ylim(12,0)
plt.savefig('Figures/Figure_2.png')
plt.show()
(11.5, 0.5)
In [ ]:
 

QC - MSD

In [21]:
plt.style.use('ggplot')
palette = ['r']*3+['b']*3+['g']*3+['y']*3
fig,ax = plt.subplots(figsize=(6,6), ncols=1, nrows=1)
UT.make_mds(np.log2(indata),palette,ax,color_dictionary={'r':'C','b':'D',
                                                        'g':'DF','y':'F',})
plt.savefig('Figures/Figure_3.png')
plt.show()
{'r': 'C', 'b': 'D', 'g': 'DF', 'y': 'F'}
In [20]:
def get_gene_ids(n):
    res = {}
    temp = n.split(';')
    temp =[n.strip() for n in temp if len(n)>2]
    for f in temp:
        key = f.split(' ')[0]
        value = f.split(' ')[1]
        key=key.replace('\"','').replace('\'','').strip()
        value=value.replace('\"','').replace('\'','').strip()
        res[key]=value
    return res['gene_id']

edgeR analysis

In [22]:
%%R -i indata
options(warn=-1)
library("limma") 
library("edgeR")
head(indata)
             C1 C2 C3 D1 D2 D3 DF1 DF2 DF3 F1 F2 F3
Tb10.v4.0073  0  0  0  1  1  0   0   2   0  0  0  2
Tb10.v4.0074  0  0  0  2  2  4   0   0   0  0  0  1
Tb10.v4.0075  0  0  1  0  2  0   0   0   0  0  0  0
Tb10.v4.0076  0  0  0  0  0  0   0   0   0  0  0  0
Tb10.v4.0077  0  0  0  0  0  0   0   0   0  0  0  0
Tb10.v4.0078 39 23 33 37 42 48  32  32  30 30 23 15
In [23]:
%%R
group <- factor(c(
    'C','C','C',
    'D','D','D',
    'DF','DF','DF',
    'F','F','F'
))

design_with_all <- model.matrix( ~0+group )

y <- DGEList(counts=indata,group=group)
keep <- filterByExpr(y)
y <- y[keep,,keep.lib.sizes=FALSE]
counts = y$counts
genes = row.names(y)

y <- calcNormFactors(y)
y <- estimateDisp(y, design_with_all)
fit_all <- glmQLFit( y, design_with_all )

Differential Expression Analysis

D vs C

In [28]:
%%R
contrastDC <- glmQLFTest( fit_all, contrast=makeContrasts( groupD-groupC, levels=design_with_all ) )
tableDC <- topTags(contrastDC, n=Inf, sort.by = "none", adjust.method="BH")$table
head(tableDC)
                   logFC     logCPM          F     PValue       FDR
Tb10.v4.0078  0.36148673 -0.6020157 2.68740696 0.12094164 0.3191780
Tb927.8.150   0.35003212 -0.4258011 4.88268749 0.06721565 0.2208936
Tb927.8.270   0.17421418  3.7407194 5.15640114 0.03754798 0.1547819
Tb927.8.320   0.08999977 -2.3199476 0.08255392 0.80908358 0.9075300
Tb927.8.443  -0.18502367  3.5739108 3.02111267 0.10167460 0.2851589
Tb927.8.449  -0.37536786 -1.4475018 1.71968473 0.20851127 0.4369400

D vs DF

In [29]:
%%R
contrastDDF <- glmQLFTest( fit_all, contrast=makeContrasts( groupD-groupDF, levels=design_with_all ) )
tableDDF <- topTags(contrastDDF, n=Inf, sort.by = "none", adjust.method="BH")$table
head(tableDDF)
                  logFC     logCPM          F       PValue         FDR
Tb10.v4.0078  0.3784196 -0.6020157  2.9319964 0.1064341983 0.225654034
Tb927.8.150   0.2808284 -0.4258011  3.2087075 0.1376720316 0.269033700
Tb927.8.270   0.1533285  3.7407194  4.0044381 0.0629019906 0.155677022
Tb927.8.320   0.1968926 -2.3199476  0.3817972 0.6023590411 0.737595008
Tb927.8.443  -0.4262638  3.5739108 16.4397658 0.0009455344 0.008225338
Tb927.8.449  -0.2930562 -1.4475018  1.0220519 0.3272974035 0.490872070

D vs F

In [30]:
%%R
contrastDF <- glmQLFTest( fit_all, contrast=makeContrasts( groupD-groupF, levels=design_with_all ) )
tableDF <- topTags(contrastDF, n=Inf, sort.by = "none", adjust.method="BH")$table
head(tableDF)
                  logFC     logCPM         F      PValue        FDR
Tb10.v4.0078  0.8324422 -0.6020157 12.346826 0.002934113 0.01709656
Tb927.8.150   0.2277149 -0.4258011  2.134789 0.224175641 0.37328426
Tb927.8.270   0.2008321  3.7407194  6.825606 0.019018155 0.06491963
Tb927.8.320   0.9649664 -2.3199476  7.093722 0.025627488 0.08022788
Tb927.8.443  -0.3843780  3.5739108 13.300917 0.002218442 0.01415545
Tb927.8.449  -0.3042787 -1.4475018  1.101866 0.309681874 0.46662636

F vs C

In [31]:
%%R
contrastFC <- glmQLFTest( fit_all, contrast=makeContrasts( groupF-groupC, levels=design_with_all ) )
tableFC <- topTags(contrastFC, n=Inf, sort.by = "none", adjust.method="BH")$table
head(tableFC)
                   logFC     logCPM          F     PValue       FDR
Tb10.v4.0078 -0.47095551 -0.6020157 3.51023878 0.07964860 0.2300972
Tb927.8.150   0.12231717 -0.4258011 0.54905268 0.54387645 0.7225729
Tb927.8.270  -0.02661789  3.7407194 0.11721848 0.73659272 0.8546985
Tb927.8.320  -0.87496659 -2.3199476 5.57536346 0.04806519 0.1724157
Tb927.8.443   0.19935435  3.5739108 3.63532663 0.07495816 0.2230052
Tb927.8.449  -0.07108915 -1.4475018 0.06642519 0.79994842 0.8915675

D vs (DF+C+F)/3

In [32]:
%%R
contrast_mine <- glmQLFTest( fit_all, contrast=makeContrasts( groupD-(groupDF+groupC+groupF)/3, levels=design_with_all ) )
table_mine <- topTags(contrast_mine, n=Inf, sort.by = "none", adjust.method="BH")$table
head(table_mine)
                  logFC     logCPM         F      PValue        FDR
Tb10.v4.0078  0.5241162 -0.6020157  8.421475 0.010520086 0.04184748
Tb927.8.150   0.2861918 -0.4258011  5.126944 0.060484573 0.14931769
Tb927.8.270   0.1761249  3.7407194  8.065740 0.011953132 0.04597633
Tb927.8.320   0.4172862 -2.3199476  2.462648 0.187151749 0.33518272
Tb927.8.443  -0.3318885  3.5739108 14.386165 0.001633858 0.01108790
Tb927.8.449  -0.3242343 -1.4475018  1.871685 0.190465532 0.33913963
In [33]:
%R -o tableDC,tableDDF,tableDF,tableFC,table_mine

def mod_table(intable):
    intable['mlog10FDR']=-np.log10(intable['FDR'])
    intable['mlog10pvalue']=-np.log10(intable['PValue'])
    intable['desc']=[desc_dict.get(n,n) for n in intable.index.values]
    return intable

tableDC = mod_table(tableDC)
tableDDF = mod_table(tableDDF)
tableDF = mod_table(tableDF)
tableFC = mod_table(tableFC)
table_mine = mod_table(table_mine)

Exploratory Pltots

In [34]:
for table,name in zip([tableDC,tableDDF,tableDF,tableFC,table_mine],
                 ['DvsC','DvsDF','DvsF','FvsC','Dvs(C/DF/F)']):

    fig,axes=plt.subplots(figsize=(14,4), ncols=2, nrows=1)
    ax=axes[0]
    table.plot(x='logFC',y='mlog10FDR',
           kind='scatter',s=5,alpha=0.2,ax=ax,c='black')
    ax.set_xlim(-2.5,2.5)
    ax.set_title('Volcano')
    ax=axes[1]
    table.plot(x='logCPM',y='logFC',
           kind='scatter',s=5,alpha=0.2,ax=ax,c='black')
    ax.set_ylim(-2.5,2.5)
    ax.set_title('MA')
    plt.suptitle(name)
    plt.show()
In [35]:
def annotated_volcano(table,name,selection=False):
    
    fig,axes=plt.subplots(figsize=(8,6), ncols=1, nrows=1)
    ax=axes
    if not selection:
        selection = table[((table['logFC']>1)|(table['logFC']<-1))
                          &(table['mlog10FDR']>2)&(table['logCPM']>2)]
    else:
        selection = table.loc[selection]
    annot_index = selection.index.values
    annot_names = selection['desc']
    UT.make_vulcano(
        table,
        ax,
        x='logFC',
        y='mlog10FDR',
        fc_col = 'logFC',
        pval_col = 'FDR',
        pval_limit=0.01,
        fc_limit=1,
        annot_index=annot_index ,
        annot_names=annot_names)
    ax.legend(loc='upper center', bbox_to_anchor=(0.8, 1.2), title='Legend')
    ax.set_xlim(-3,3)
    plt.title(name)

Volcano Plots

In [37]:
for table,name in zip([tableDC,tableDDF,tableDF,tableFC,table_mine],
                 ['DvsC','DvsDF','DvsF','FvsC','Dvs(C/DF/F)']):
    annotated_volcano(table,name)
No handles with labels found to put in legend.
No handles with labels found to put in legend.
In [41]:
table_dict = {}
for  table,name  in zip([tableDC,tableDDF,tableDF],['tableDC','tableDDF','tableDF']):
                    
    selection = table[((table['logFC']>1)|(table['logFC']<-1))
                          &(table['mlog10FDR']>2)&(table['logCPM']>1)]
    
    table_dict[name]= list(selection.index.values)

Upset Plots

To identify differentially regulated genes in common between the conditions

In [42]:
from upsetplot import plot,UpSet
from upsetplot import from_contents
import matplotlib
data_upset = from_contents(table_dict)
plot(data_upset)
fig = plt.gcf()
fig.set_size_inches(10, 5)
plt.show()
In [46]:
#list of common genes
common = set(table_dict['tableDC']) & set(table_dict['tableDDF']) & set(table_dict['tableDF'])
common
Out[46]:
{'KS17gene_4150a',
 'TRY.133',
 'Tb427.BES129.2',
 'Tb427.BES129.3',
 'Tb427.BES129.4',
 'Tb427VSG-1559',
 'Tb427VSG-6609',
 'Tb927.11.12100'}

Visulize the common proteins

In [45]:
for table,name in zip([tableDC,tableDDF,tableDF,tableFC,table_mine],
                 ['DvsC','DvsDF','DvsF','FvsC','Dvs(C/DF/F)']):
    annotated_volcano(table,name,selection=list(common))
No handles with labels found to put in legend.
No handles with labels found to put in legend.
In [47]:
Image('Figures/Tb927.11.12100_paperFig.png')
Out[47]:
In [50]:
Image('Figures/Tb427.BES129.3_paperFig.png')
Out[50]:
In [53]:
Image('Figures/TRY.133_paperFigSL.png')
Out[53]:
In [ ]:
!jupyter nbconvert --to html_toc analysis_def.ipynb