Effect of deferoxamine on the transcriptome of BSF trypanosomes¶

Michele Tinti & Calvin Tiengwe

set up notebook¶

#reload when modified
%load_ext autoreload
%autoreload 2
#activate r magic
%load_ext rpy2.ipython
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utilities as UT
import missingno as msno
import random
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import gc

random.seed(1976)
np.random.seed(1976)

Data Anaylsis¶

Experiment SetUp¶

from IPython.display import Image

#create a dictionary of gene to desc
#from the gff file
def make_desc(_GFF):
    gff =pd.read_csv( _GFF, sep='\t', header=None, comment='#')
    
    gff = gff[gff.iloc[:,2]=='gene']
    #print( gff[gff[gff.columns[-1]].str.contains('Tb427_020006200')] )
    desc = {}
    for n in gff.iloc[:,-1]:
        n=n.replace('%2C',' ')
        item_list = n.split(';')
        #print (item_list)
        temp_dict = {}
        for m in item_list:
            #print(m)
            temp_dict[m.split('=')[0].strip()]=m.split('=')[1].strip()
        #print(temp_dict['ID'])
        #print(temp_dict['description'])
        desc[temp_dict['ID']]=temp_dict.get('description','none')
    return desc

desc_dict = make_desc('genomes/tb927_3/tb927_3.gff')
desc_dict['Tb10.v4.0073']

'variant surface glycoprotein (VSG  pseudogene)  putative'

Load Read Counts¶

exp = '{life_stage}{replica}'
list_df = [exp.format(
    life_stage=life_stage,
    replica=replica) 
 for life_stage in ['C','D','DF','F']
 for replica in ['1','2','3']
            ]
list_df = [n+'/res/'+n+'/counts.txt' for n in list_df]
list_df =[pd.read_csv(n,index_col=[0],comment='#',sep='\t') for n in list_df]
df = list_df[0].copy()
for temp_df in list_df[1:]:
    df = df.join(temp_df.iloc[:,-1])
df.head()
#temp_df = pd.read_csv('BSF/tb927_3_ks_counts_final.txt',index_col=[0],comment='#',sep='\t')

data_col = df.columns[5:]
data_col

Index(['/tmp/2992.1.all.q/C1/C1_sorted.bam',
       '/tmp/2993.1.all.q/C2/C2_sorted.bam',
       '/tmp/2994.1.all.q/C3/C3_sorted.bam',
       '/tmp/2995.1.all.q/D1/D1_sorted.bam',
       '/tmp/2996.1.all.q/D2/D2_sorted.bam',
       '/tmp/2997.1.all.q/D3/D3_sorted.bam',
       '/tmp/2998.1.all.q/DF1/DF1_sorted.bam',
       '/tmp/2999.1.all.q/DF2/DF2_sorted.bam',
       '/tmp/3000.1.all.q/DF3/DF3_sorted.bam',
       '/tmp/3001.1.all.q/F1/F1_sorted.bam',
       '/tmp/3002.1.all.q/F2/F2_sorted.bam',
       '/tmp/3003.1.all.q/F3/F3_sorted.bam'],
      dtype='object')

indata = df[data_col]
indata.columns = [n.split('/')[3] for  n in indata.columns]
indata.head()

labels¶

C - control
D - cells incubated with iron chelator deferoxamine
DF - iron-presaturated deferoxamine
F - FeCl3

print(indata.shape)
indata=indata.dropna()
print(indata.shape)
#indata.loc['KS17gene_1749a']
#indata['desc']=[desc_dict.get(n,'none') for n in indata.index.values]
#indata.to_csv('indata.csv')
#indata.head()
#indata.loc['mainVSG-427-2']

(19949, 12)
(19949, 12)

Missing Data Viz¶

msno.matrix(indata.replace(0,np.nan).dropna(how='all'),figsize=(6, 4))

<matplotlib.axes._subplots.AxesSubplot at 0x2b9910125cf8>

msno.bar(indata.replace(0,np.nan).dropna(how='all'),figsize=(6, 4))

<matplotlib.axes._subplots.AxesSubplot at 0x2b98f433f0f0>

QC - Corr analysis¶

!mkdir -p Figures

fig,ax=plt.subplots(figsize=(6,6))
cbar_ax = fig.add_axes([.91, .6, .03, .2])
sns.heatmap(np.log2(indata).corr(),
            #vmin=-1,
            cmap='coolwarm',
            annot=True,linewidths=.5,ax=ax, cbar_ax = cbar_ax, cbar=True)
print(ax.get_ylim())
ax.set_ylim(12,0)
plt.savefig('Figures/Figure_2.png')
plt.show()

(11.5, 0.5)

QC - MSD¶

plt.style.use('ggplot')
palette = ['r']*3+['b']*3+['g']*3+['y']*3
fig,ax = plt.subplots(figsize=(6,6), ncols=1, nrows=1)
UT.make_mds(np.log2(indata),palette,ax,color_dictionary={'r':'C','b':'D',
                                                        'g':'DF','y':'F',})
plt.savefig('Figures/Figure_3.png')
plt.show()

{'r': 'C', 'b': 'D', 'g': 'DF', 'y': 'F'}

def get_gene_ids(n):
    res = {}
    temp = n.split(';')
    temp =[n.strip() for n in temp if len(n)>2]
    for f in temp:
        key = f.split(' ')[0]
        value = f.split(' ')[1]
        key=key.replace('\"','').replace('\'','').strip()
        value=value.replace('\"','').replace('\'','').strip()
        res[key]=value
    return res['gene_id']

edgeR analysis¶

%%R -i indata
options(warn=-1)
library("limma") 
library("edgeR")
head(indata)

             C1 C2 C3 D1 D2 D3 DF1 DF2 DF3 F1 F2 F3
Tb10.v4.0073  0  0  0  1  1  0   0   2   0  0  0  2
Tb10.v4.0074  0  0  0  2  2  4   0   0   0  0  0  1
Tb10.v4.0075  0  0  1  0  2  0   0   0   0  0  0  0
Tb10.v4.0076  0  0  0  0  0  0   0   0   0  0  0  0
Tb10.v4.0077  0  0  0  0  0  0   0   0   0  0  0  0
Tb10.v4.0078 39 23 33 37 42 48  32  32  30 30 23 15

%%R
group <- factor(c(
    'C','C','C',
    'D','D','D',
    'DF','DF','DF',
    'F','F','F'
))

design_with_all <- model.matrix( ~0+group )

y <- DGEList(counts=indata,group=group)
keep <- filterByExpr(y)
y <- y[keep,,keep.lib.sizes=FALSE]
counts = y$counts
genes = row.names(y)

y <- calcNormFactors(y)
y <- estimateDisp(y, design_with_all)
fit_all <- glmQLFit( y, design_with_all )

Differential Expression Analysis¶

D vs C¶

%%R
contrastDC <- glmQLFTest( fit_all, contrast=makeContrasts( groupD-groupC, levels=design_with_all ) )
tableDC <- topTags(contrastDC, n=Inf, sort.by = "none", adjust.method="BH")$table
head(tableDC)

                   logFC     logCPM          F     PValue       FDR
Tb10.v4.0078  0.36148673 -0.6020157 2.68740696 0.12094164 0.3191780
Tb927.8.150   0.35003212 -0.4258011 4.88268749 0.06721565 0.2208936
Tb927.8.270   0.17421418  3.7407194 5.15640114 0.03754798 0.1547819
Tb927.8.320   0.08999977 -2.3199476 0.08255392 0.80908358 0.9075300
Tb927.8.443  -0.18502367  3.5739108 3.02111267 0.10167460 0.2851589
Tb927.8.449  -0.37536786 -1.4475018 1.71968473 0.20851127 0.4369400

D vs DF¶

%%R
contrastDDF <- glmQLFTest( fit_all, contrast=makeContrasts( groupD-groupDF, levels=design_with_all ) )
tableDDF <- topTags(contrastDDF, n=Inf, sort.by = "none", adjust.method="BH")$table
head(tableDDF)

                  logFC     logCPM          F       PValue         FDR
Tb10.v4.0078  0.3784196 -0.6020157  2.9319964 0.1064341983 0.225654034
Tb927.8.150   0.2808284 -0.4258011  3.2087075 0.1376720316 0.269033700
Tb927.8.270   0.1533285  3.7407194  4.0044381 0.0629019906 0.155677022
Tb927.8.320   0.1968926 -2.3199476  0.3817972 0.6023590411 0.737595008
Tb927.8.443  -0.4262638  3.5739108 16.4397658 0.0009455344 0.008225338
Tb927.8.449  -0.2930562 -1.4475018  1.0220519 0.3272974035 0.490872070

D vs F¶

%%R
contrastDF <- glmQLFTest( fit_all, contrast=makeContrasts( groupD-groupF, levels=design_with_all ) )
tableDF <- topTags(contrastDF, n=Inf, sort.by = "none", adjust.method="BH")$table
head(tableDF)

                  logFC     logCPM         F      PValue        FDR
Tb10.v4.0078  0.8324422 -0.6020157 12.346826 0.002934113 0.01709656
Tb927.8.150   0.2277149 -0.4258011  2.134789 0.224175641 0.37328426
Tb927.8.270   0.2008321  3.7407194  6.825606 0.019018155 0.06491963
Tb927.8.320   0.9649664 -2.3199476  7.093722 0.025627488 0.08022788
Tb927.8.443  -0.3843780  3.5739108 13.300917 0.002218442 0.01415545
Tb927.8.449  -0.3042787 -1.4475018  1.101866 0.309681874 0.46662636

F vs C¶

%%R
contrastFC <- glmQLFTest( fit_all, contrast=makeContrasts( groupF-groupC, levels=design_with_all ) )
tableFC <- topTags(contrastFC, n=Inf, sort.by = "none", adjust.method="BH")$table
head(tableFC)

                   logFC     logCPM          F     PValue       FDR
Tb10.v4.0078 -0.47095551 -0.6020157 3.51023878 0.07964860 0.2300972
Tb927.8.150   0.12231717 -0.4258011 0.54905268 0.54387645 0.7225729
Tb927.8.270  -0.02661789  3.7407194 0.11721848 0.73659272 0.8546985
Tb927.8.320  -0.87496659 -2.3199476 5.57536346 0.04806519 0.1724157
Tb927.8.443   0.19935435  3.5739108 3.63532663 0.07495816 0.2230052
Tb927.8.449  -0.07108915 -1.4475018 0.06642519 0.79994842 0.8915675

D vs (DF+C+F)/3¶

%%R
contrast_mine <- glmQLFTest( fit_all, contrast=makeContrasts( groupD-(groupDF+groupC+groupF)/3, levels=design_with_all ) )
table_mine <- topTags(contrast_mine, n=Inf, sort.by = "none", adjust.method="BH")$table
head(table_mine)

                  logFC     logCPM         F      PValue        FDR
Tb10.v4.0078  0.5241162 -0.6020157  8.421475 0.010520086 0.04184748
Tb927.8.150   0.2861918 -0.4258011  5.126944 0.060484573 0.14931769
Tb927.8.270   0.1761249  3.7407194  8.065740 0.011953132 0.04597633
Tb927.8.320   0.4172862 -2.3199476  2.462648 0.187151749 0.33518272
Tb927.8.443  -0.3318885  3.5739108 14.386165 0.001633858 0.01108790
Tb927.8.449  -0.3242343 -1.4475018  1.871685 0.190465532 0.33913963

%R -o tableDC,tableDDF,tableDF,tableFC,table_mine

def mod_table(intable):
    intable['mlog10FDR']=-np.log10(intable['FDR'])
    intable['mlog10pvalue']=-np.log10(intable['PValue'])
    intable['desc']=[desc_dict.get(n,n) for n in intable.index.values]
    return intable

tableDC = mod_table(tableDC)
tableDDF = mod_table(tableDDF)
tableDF = mod_table(tableDF)
tableFC = mod_table(tableFC)
table_mine = mod_table(table_mine)

Exploratory Pltots¶

for table,name in zip([tableDC,tableDDF,tableDF,tableFC,table_mine],
                 ['DvsC','DvsDF','DvsF','FvsC','Dvs(C/DF/F)']):

    fig,axes=plt.subplots(figsize=(14,4), ncols=2, nrows=1)
    ax=axes[0]
    table.plot(x='logFC',y='mlog10FDR',
           kind='scatter',s=5,alpha=0.2,ax=ax,c='black')
    ax.set_xlim(-2.5,2.5)
    ax.set_title('Volcano')
    ax=axes[1]
    table.plot(x='logCPM',y='logFC',
           kind='scatter',s=5,alpha=0.2,ax=ax,c='black')
    ax.set_ylim(-2.5,2.5)
    ax.set_title('MA')
    plt.suptitle(name)
    plt.show()

def annotated_volcano(table,name,selection=False):
    
    fig,axes=plt.subplots(figsize=(8,6), ncols=1, nrows=1)
    ax=axes
    if not selection:
        selection = table[((table['logFC']>1)|(table['logFC']<-1))
                          &(table['mlog10FDR']>2)&(table['logCPM']>2)]
    else:
        selection = table.loc[selection]
    annot_index = selection.index.values
    annot_names = selection['desc']
    UT.make_vulcano(
        table,
        ax,
        x='logFC',
        y='mlog10FDR',
        fc_col = 'logFC',
        pval_col = 'FDR',
        pval_limit=0.01,
        fc_limit=1,
        annot_index=annot_index ,
        annot_names=annot_names)
    ax.legend(loc='upper center', bbox_to_anchor=(0.8, 1.2), title='Legend')
    ax.set_xlim(-3,3)
    plt.title(name)

Volcano Plots¶

for table,name in zip([tableDC,tableDDF,tableDF,tableFC,table_mine],
                 ['DvsC','DvsDF','DvsF','FvsC','Dvs(C/DF/F)']):
    annotated_volcano(table,name)

No handles with labels found to put in legend.
No handles with labels found to put in legend.

table_dict = {}
for  table,name  in zip([tableDC,tableDDF,tableDF],['tableDC','tableDDF','tableDF']):
                    
    selection = table[((table['logFC']>1)|(table['logFC']<-1))
                          &(table['mlog10FDR']>2)&(table['logCPM']>1)]
    
    table_dict[name]= list(selection.index.values)

Upset Plots¶

To identify differentially regulated genes in common between the conditions

from upsetplot import plot,UpSet
from upsetplot import from_contents
import matplotlib
data_upset = from_contents(table_dict)
plot(data_upset)
fig = plt.gcf()
fig.set_size_inches(10, 5)
plt.show()

#list of common genes
common = set(table_dict['tableDC']) & set(table_dict['tableDDF']) & set(table_dict['tableDF'])
common

{'KS17gene_4150a',
 'TRY.133',
 'Tb427.BES129.2',
 'Tb427.BES129.3',
 'Tb427.BES129.4',
 'Tb427VSG-1559',
 'Tb427VSG-6609',
 'Tb927.11.12100'}

Visulize the common proteins¶

for table,name in zip([tableDC,tableDDF,tableDF,tableFC,table_mine],
                 ['DvsC','DvsDF','DvsF','FvsC','Dvs(C/DF/F)']):
    annotated_volcano(table,name,selection=list(common))

No handles with labels found to put in legend.
No handles with labels found to put in legend.

Image('Figures/Tb927.11.12100_paperFig.png')

Image('Figures/Tb427.BES129.3_paperFig.png')

Image('Figures/TRY.133_paperFigSL.png')

!jupyter nbconvert --to html_toc analysis_def.ipynb

	Chr	Start	End	Strand	Length	/tmp/2992.1.all.q/C1/C1_sorted.bam	/tmp/2993.1.all.q/C2/C2_sorted.bam	/tmp/2994.1.all.q/C3/C3_sorted.bam	/tmp/2995.1.all.q/D1/D1_sorted.bam	/tmp/2996.1.all.q/D2/D2_sorted.bam	/tmp/2997.1.all.q/D3/D3_sorted.bam	/tmp/2998.1.all.q/DF1/DF1_sorted.bam	/tmp/2999.1.all.q/DF2/DF2_sorted.bam	/tmp/3000.1.all.q/DF3/DF3_sorted.bam	/tmp/3001.1.all.q/F1/F1_sorted.bam	/tmp/3002.1.all.q/F2/F2_sorted.bam	/tmp/3003.1.all.q/F3/F3_sorted.bam
Geneid
Tb10.v4.0073	tryp_X-188b09.p2kB601	929	1489	+	561	0	0	0	1	1	0	0	2	0	0	0	2
Tb10.v4.0074	tryp_X-188b09.p2kB601	2775	3452	+	678	0	0	0	2	2	4	0	0	0	0	0	1
Tb10.v4.0075	tryp_X-188b09.p2kB601	3781	5223	+	1443	0	0	1	0	2	0	0	0	0	0	0	0
Tb10.v4.0076	tryp_X-188b09.p2kB601	6264	7721	+	1458	0	0	0	0	0	0	0	0	0	0	0	0
Tb10.v4.0077	tryp_X-188b09.p2kB601	9669	10955	+	1287	0	0	0	0	0	0	0	0	0	0	0	0