Hot Start Vignette¶
Imports¶
# basic imports
import os
import pandas as pd
import numpy as np
import IPython
# tcrdist classes
from tcrdist.repertoire import TCRrep
from tcrdist.subset import TCRsubset
from tcrdist.cdr3_motif import TCRMotif
from tcrdist.storage import StoreIOMotif, StoreIOEntropy
# tcrdist functions
from tcrdist import plotting
from tcrdist.mappers import populate_legacy_fields
# scipy functions for clustering
from scipy.spatial import distance
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
# sklearn functions for low-dimensional embeddings
from sklearn.manifold import TSNE, MDS
# plotnine to allow grammar of graphics plotting akin to R's ggplot2
import plotnine as gg
TCRrep
¶
Download repertoire data: (dash.csv 375KB).
subject | epitope | count | v_a_gene | j_a_gene | cdr3_a_aa | cdr3_a_nucseq | v_b_gene | j_b_gene | cdr3_b_aa | cdr3_b_nucseq | clone_id |
---|---|---|---|---|---|---|---|---|---|---|---|
mouse_subject0050 | PA | 2 | TRAV7-3*01 | TRAJ33*01 | CAVSLDSNYQLIW | tgtgcagtgagcctcgatagcaactatcagttgatctgg | TRBV13-1*01 | TRBJ2-3*01 | CASSDFDWGGDAETLYF | tgtgccagcagtgatttcgactggggaggggatgcagaaacgctgtatttt | mouse_tcr0072.clone |
mouse_subject0050 | PA | 6 | TRAV6D-6*01 | TRAJ56*01 | CALGDRATGGNNKLTF | tgtgctctgggtgacagggctactggaggcaataataagctgactttt | TRBV29*01 | TRBJ1-1*01 | CASSPDRGEVFF | tgtgctagcagtccggacaggggtgaagtcttcttt | mouse_tcr0096.clone |
mouse_subject0050 | PA | 1 | TRAV6D-6*01 | TRAJ49*01 | CALGSNTGYQNFYF | tgtgctctgggctcgaacacgggttaccagaacttctatttt | TRBV29*01 | TRBJ1-5*01 | CASTGGGAPLF | tgtgctagcacagggggaggggctccgcttttt | mouse_tcr0276.clone |
Calculate pairwise TCR distances between all clones in the repertoire.
#1 load data, subset to receptors recognizing "PA" epitope
tcrdist2_df = pd.read_csv(os.path.join("dash.csv"))
tcrdist2_df = tcrdist2_df[tcrdist2_df.epitope == "PA"].copy()
#2 create instance of TCRrep class, initializes input as tr.cell_df attribute
tr = TCRrep(cell_df = tcrdist2_df, chains = ['alpha','beta'],organism = "mouse")
#3 Infer CDR1,CDR2,CDR2.5 (a.k.a. phmc) from germline v-genes
tr.infer_cdrs_from_v_gene(chain = 'alpha', imgt_aligned=True)
tr.infer_cdrs_from_v_gene(chain = 'beta', imgt_aligned=True)
#4 Define index columns for determining unique clones.
tr.index_cols = ['clone_id', 'subject', 'epitope',
'v_a_gene', 'j_a_gene', 'v_b_gene', 'j_b_gene',
'cdr3_a_aa', 'cdr3_b_aa',
'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',
'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',
'cdr3_b_nucseq', 'cdr3_a_nucseq']
#4 Deduplicate based on index cols, creating tr.clone_df attribute
tr.deduplicate()
#5 calculate tcrdists by method in Dash et al.
tr._tcrdist_legacy_method_alpha_beta()
#6 Check that sum of alpah-chain and beta-chain distance matrices equal paired_tcrdist
distA = tr.dist_a
distB = tr.dist_b
assert np.all(((distA + distB) - tr.paired_tcrdist) == 0)
Hier. Cluster TCR-Distances¶
The pairwise TCR distance matrix calculated by
tcrdist2.repertoire.TCRrep
(i.e., accessible as tr.paired_tcrdist
) can serve
as the input for scipy.cluster.hierarchy
tools or your preferred python
library for unsupervised clustering of sequences within the repertoire.
The rows of tr.clone_df
match the order of rows in
tr.paired_distance
. Thus, any cluster_index
you produce, must
match length and order of tr.clone_df
.
from scipy.spatial import distance
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
compressed_dmat = distance.squareform(tr.paired_tcrdist, force = "vector")
Z = linkage(compressed_dmat, method = "complete")
den = dendrogram(Z, color_threshold = np.inf, no_plot = True)
cluster_index = fcluster(Z, t = 20, criterion = "maxclust")
assert len(cluster_index) == tr.clone_df.shape[0]
assert len(cluster_index) == tr.paired_tcrdist.shape[0]
tr.clone_df['cluster_index'] = cluster_index
2D Embeddings¶
The pairwise TCR distance matrix calculated by
tcrdist2.repertoire.TCRrep
(i.e., accessible as tr.paired_tcrdist
) can be used
as input for embedding the pairwise dissimilarity matrix in fewer dimensions.
Here we provide an example using sklearn
to perform tSNE and MDS.
import numpy as np
from sklearn.manifold import TSNE, MDS
X = tr.paired_tcrdist
X_embedded = TSNE(n_components=2).fit_transform(X)
tsne_df = pd.DataFrame(X_embedded, columns = ["tSNE1","tSNE2"])
tsne_df['ci'] = cluster_index
tsne_df['ci'] = tsne_df['ci'].astype('category')
tsne_df['epitope'] = tr.clone_df.epitope.astype('category')
X_embedded_mds = MDS(n_components=2, dissimilarity='precomputed').fit_transform(X)
mds_df = pd.DataFrame(X_embedded_mds, columns = ["MDS1","MDS2"])
mds_df['ci'] = cluster_index
mds_df['ci'] = mds_df['ci'].astype('category')
mds_df['epitope'] = tr.clone_df.epitope.astype('category')
def gg_embed_plot(df,
xvar = 'tSNE1',
yvar = 'tSNE2'):
"""
Function for plotting embedding using plotnine
Parameters
----------
df : pandas.DataFrame
xvar : str
column with data to be plotted along the x-axis
yvar : str
column with data to be plotted along the y-axis
Returns
-------
gg_em_plot : plotnine.ggplot.ggplot
"""
import plotnine as gg
gg_em_plt = ( gg.ggplot(df)
+ gg.aes(x = xvar,
y = yvar,
col = "ci",
label = 'ci',
fill = "ci",
shape = "epitope")
+ gg.geom_point(size = 3, alpha = .5)
+ gg.geom_text(size = 5)
+ gg.theme_minimal())
return(gg_em_plt)
tSNE¶
gg_embed_plot(df = tsne_df, xvar = "tSNE1", yvar="tSNE2")
MDS¶
gg_embed_plot(df = mds_df, xvar = "MDS1", yvar="MDS2")
TCRsubset
¶
Consider cluster 5 of the PA-reactive repertoire in the plot above. Using tcrdist2 we can search
for motifs in any portion of the repertoire by specifying a criteria and initializing
TCRsubset
with a subset of the data as shown below.
# define a logical criteria
criteria = (cluster_index == 5)
# subset the TCRrep clone DataFrame to only those sequences meeting that criteria
clone_df_subset = tr.clone_df[criteria]
clone_df_subset = clone_df_subset[clone_df_subset.epitope == "PA"].copy()
# subset the alpha chain and beta chain distance matrices using the `clone_df_subset.clone_id` index
dist_a_subset = tr.dist_a.loc[clone_df_subset.clone_id, clone_df_subset.clone_id].copy()
dist_b_subset = tr.dist_b.loc[clone_df_subset.clone_id, clone_df_subset.clone_id].copy()
# use the populate_legacy_fields function to add some columns needed for compatability with tcrdist1
clone_df_subset = populate_legacy_fields(df = clone_df_subset, chains =['alpha', 'beta'])
# initialize an instance of the TCRsubset class.
ts = TCRsubset(clone_df_subset,
organism = "mouse",
epitopes = ["PA"] ,
epitope = "PA",
chains = ["A","B"],
dist_a = dist_a_subset,
dist_b = dist_b_subset)
read 169087 A-chains from /Users/kmayerbl/TCRDIST/tcrdist2/tcrdist/db/alphabeta_db.tsv_files/new_nextgen_chains_mouse_A.tsv
read 1947545 B-chains from /Users/kmayerbl/TCRDIST/tcrdist2/tcrdist/db/alphabeta_db.tsv_files/new_nextgen_chains_mouse_B.tsv
Plot Gene Usage¶
Gene Usage in the Repertoire¶
Consider the gene usage of the whole PA-reactive repertoire.
from tcrdist import plotting
gene_usage_repertoire_svg = plotting.plot_pairings(cell_df = tr.clone_df,
cols = ['j_a_gene',
'v_a_gene',
'v_b_gene',
'j_b_gene'],
count_col='count')
IPython.display.SVG(data=gene_usage_repertoire_svg)
Gene Usage in the Subset¶
Consider the gene usage in cluster 5 of the PA-reactive repertoire.
gene_usage_subset_svg = plotting.plot_pairings(cell_df = ts.clone_df,
cols = ['j_a_gene',
'v_a_gene',
'v_b_gene',
'j_b_gene'],
count_col='count')
IPython.display.SVG(data=gene_usage_subset_svg)
Discover Motifs¶
This process may take a few minutes depending on the size of the cluster selected.
if os.path.isfile("dash_PA_cluster_5_motifs.csv"):
ts.motif_df = pd.read_csv("dash_PA_cluster_5_motifs.csv")
else:
motif_df = ts.find_motif()
Save Discovered Motifs¶
ts.motif_df.to_csv("dash_PA_cluster_5_motifs.csv", index = False)
Plot Motifs¶
motif_list_a = list()
motif_logos_a = list()
for i,row in ts.motif_df[ts.motif_df.ab == "A"].iterrows():
StoreIOMotif_instance = ts.eval_motif(row)
motif_list_a.append(StoreIOMotif_instance)
motif_logos_a.append(plotting.plot_pwm(StoreIOMotif_instance, create_file = False, my_height = 200, my_width = 600))
motif_list_b = list()
motif_logos_b = list()
for i,row in ts.motif_df[ts.motif_df.ab == "B"].iterrows():
StoreIOMotif_instance = ts.eval_motif(row)
motif_list_b.append(StoreIOMotif_instance)
motif_logos_b.append(plotting.plot_pwm(StoreIOMotif_instance, create_file = False, my_height = 200, my_width = 600))
Top Alpha-Chain Motif¶
You can visualize the ith motifs by changing the index [0] to [i] in the code below.
IPython.display.SVG(motif_logos_a[0])
Top Beta-Chain Motif¶
IPython.display.SVG(motif_logos_b[0])
All Code in One Block¶
# All code in one block, minus plotting.
# basic imports
import os
import pandas as pd
import numpy as np
import IPython
# tcrdist classes
from tcrdist.repertoire import TCRrep
from tcrdist.subset import TCRsubset
from tcrdist.cdr3_motif import TCRMotif
from tcrdist.storage import StoreIOMotif, StoreIOEntropy
# tcrdist functions
from tcrdist import plotting
from tcrdist.mappers import populate_legacy_fields
# scipy functions for clustering
from scipy.spatial import distance
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
# sklearn functions for low-dimensional embeddings
from sklearn.manifold import TSNE, MDS
# plotnine to allow grammar of graphics plotting akin to R's ggplot2
import plotnine as gg
#1 load data, subset to receptors recognizing "PA" epitope
tcrdist2_df = pd.read_csv(os.path.join("tcrdist",""test_files_compact","dash.csv"))
tcrdist2_df = tcrdist2_df[tcrdist2_df.epitope == "PA"].copy()
#2 create instance of TCRrep class, initializes input as tr.cell_df attribute
tr = TCRrep(cell_df = tcrdist2_df, chains = ['alpha','beta'],organism = "mouse")
#3 Infer CDR1,CDR2,CDR2.5 (a.k.a. phmc) from germline v-genes
tr.infer_cdrs_from_v_gene(chain = 'alpha', imgt_aligned=True)
tr.infer_cdrs_from_v_gene(chain = 'beta', imgt_aligned=True)
#4 Define index columns for determining unique clones.
tr.index_cols = ['clone_id', 'subject', 'epitope',
'v_a_gene', 'j_a_gene', 'v_b_gene', 'j_b_gene',
'cdr3_a_aa', 'cdr3_b_aa',
'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa',
'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa',
'cdr3_b_nucseq', 'cdr3_a_nucseq']
#4 Deduplicate based on index cols, creating tr.clone_df attribute
tr.deduplicate()
#5 calculate tcrdists by method in Dash et al.
tr._tcrdist_legacy_method_alpha_beta()
#6 Check that sum of alpah-chain and beta-chain distance matrices equal paired_tcrdist
distA = tr.dist_a
distB = tr.dist_b
assert np.all(((distA + distB) - tr.paired_tcrdist) == 0)
# Cluster
from scipy.spatial import distance
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
compressed_dmat = distance.squareform(tr.paired_tcrdist, force = "vector")
Z = linkage(compressed_dmat, method = "complete")
den = dendrogram(Z, color_threshold = np.inf, no_plot = True)
cluster_index = fcluster(Z, t = 20, criterion = "maxclust")
assert len(cluster_index) == tr.clone_df.shape[0]
assert len(cluster_index) == tr.paired_tcrdist.shape[0]
tr.clone_df['cluster_index'] = cluster_index
# Subset to Cluster 5
criteria = (cluster_index == 5)
clone_df_subset = tr.clone_df[criteria]
clone_df_subset = clone_df_subset[clone_df_subset.epitope == "PA"].copy()
dist_a_subset = tr.dist_a.loc[clone_df_subset.clone_id, clone_df_subset.clone_id].copy()
dist_b_subset = tr.dist_b.loc[clone_df_subset.clone_id, clone_df_subset.clone_id].copy()
clone_df_subset = populate_legacy_fields(df = clone_df_subset, chains =['alpha', 'beta'])
ts = TCRsubset(clone_df_subset,
organism = "mouse",
epitopes = ["PA"] ,
epitope = "PA",
chains = ["A","B"],
dist_a = dist_a_subset,
dist_b = dist_b_subset)
# Find Motifs
if os.path.isfile("dash_PA_cluster_5_motifs.csv"):
ts.motif_df = pd.read_csv("dash_PA_cluster_5_motifs.csv")
else:
motif_df = ts.find_motif()
# Save Motifs
ts.motif_df.to_csv("dash_PA_cluster_5_motifs.csv", index = False)
# Preprocess Motifs
motif_list_a = list()
motif_logos_a = list()
for i,row in ts.motif_df[ts.motif_df.ab == "A"].iterrows():
StoreIOMotif_instance = ts.eval_motif(row)
motif_list_a.append(StoreIOMotif_instance)
motif_logos_a.append(plotting.plot_pwm(StoreIOMotif_instance, create_file = False, my_height = 200, my_width = 600))
motif_list_b = list()
motif_logos_b = list()
for i,row in ts.motif_df[ts.motif_df.ab == "B"].iterrows():
StoreIOMotif_instance = ts.eval_motif(row)
motif_list_b.append(StoreIOMotif_instance)
motif_logos_b.append(plotting.plot_pwm(StoreIOMotif_instance, create_file = False, my_height = 200, my_width = 600))