Source code for utils.SCTransform
import statsmodels.nonparametric.kernel_regression
from KDEpy import FFTKDE
from multiprocessing import Pool, Manager
from scipy import stats
import numpy as np
import os
import pandas as pd
import statsmodels.discrete.discrete_model
from anndata import AnnData
import scipy as sp
_EPS = np.finfo(float).eps
[docs]
def robust_scale_binned(y, x, breaks):
bins = np.digitize(x,breaks)
binsu = np.unique(bins)
res = np.zeros(bins.size)
for i in range(binsu.size):
yb = y[bins==binsu[i]]
res[bins==binsu[i]] = (yb - np.median(yb)) / (1.4826 * np.median(np.abs(yb - np.median(yb)))+ _EPS)
return res
[docs]
def is_outlier(y, x, th = 10):
z = FFTKDE(kernel='gaussian', bw='ISJ').fit(x)
z.evaluate();
bin_width = (max(x) - min(x)) * z.bw / 2
eps = _EPS * 10
breaks1 = np.arange(min(x),max(x)+ bin_width,bin_width)
breaks2 = np.arange(min(x) - eps - bin_width/2,max(x)+bin_width,bin_width)
score1 = robust_scale_binned(y, x, breaks1)
score2 = robust_scale_binned(y, x, breaks2)
return np.abs(np.vstack((score1,score2))).min(0)>th
def _parallel_init(igenes_bin_regress,iumi_bin,ign,imm,ips):
global genes_bin_regress
global umi_bin
global gn
global mm
global ps
genes_bin_regress = igenes_bin_regress
umi_bin = iumi_bin
gn = ign
mm = imm
ps = ips
def _parallel_wrapper(j):
name = gn[genes_bin_regress[j]]
y = umi_bin[:,j].A.flatten()
pr = statsmodels.discrete.discrete_model.Poisson(y,mm)
res = pr.fit(disp=False)
mu = res.predict()
theta = theta_ml(y,mu)
ps[name] = np.append(res.params,theta)
[docs]
def gmean(X,axis=0,eps=1):
X=X.copy()
X.data[:] = np.log(X.data+eps)
return np.exp(X.mean(axis).A.flatten())-eps
[docs]
def theta_ml(y,mu):
n = y.size
weights = np.ones(n)
limit = 10
eps = (_EPS)**0.25
from scipy.special import psi, polygamma
def score(n,th,mu,y,w):
return sum(w*(psi(th + y) - psi(th) + np.log(th) + 1 - np.log(th + mu) - (y + th)/(mu + th)))
def info(n,th,mu,y,w):
return sum(w*( - polygamma(1,th + y) + polygamma(1,th) - 1/th + 2/(mu + th) - (y + th)/(mu + th)**2))
t0 = n/sum(weights*(y/mu - 1)**2)
it = 0
de = 1
while(it + 1 < limit and abs(de) > eps):
it+=1
t0 = abs(t0)
i = info(n, t0, mu, y, weights)
de = score(n, t0, mu, y, weights)/i
t0 += de
t0 = max(t0,0)
return t0
[docs]
def SCTransform(adata,min_cells=5,gmean_eps=1,n_genes=2000,n_cells=None,bin_size=500,bw_adjust=3,inplace=True):
"""
This is a port of SCTransform from the Satija lab. See the R package for original documentation.
Currently, only regression against the log UMI counts are supported.
The only significant modification is that negative Pearson residuals are zero'd out to preserve
the sparsity structure of the data.
"""
X=adata.X.copy()
X=sp.sparse.csr_matrix(X)
X.eliminate_zeros();
gn = np.array(list(adata.var_names))
cn = np.array(list(adata.obs_names))
genes_cell_count = X.sum(0).A.flatten()
genes = np.where(genes_cell_count >= min_cells)[0]
genes_ix=genes.copy()
X = X[:,genes]
Xraw=X.copy()
gn = gn[genes]
genes = np.arange(X.shape[1])
genes_cell_count = X.sum(0).A.flatten()
genes_log_gmean = np.log10(gmean(X,axis=0,eps=gmean_eps))
if n_cells is not None and n_cells < X.shape[0]:
cells_step1 = np.sort(np.random.choice(X.shape[0],replace=False,size=n_cells))
genes_cell_count_step1 = X[cells_step1].sum(0).A.flatten()
genes_step1 = np.where(genes_cell_count_step1 >= min_cells)[0]
genes_log_gmean_step1 = np.log10(gmean(X[cells_step1][:,genes_step1],axis=0,eps=gmean_eps))
else:
cells_step1 = np.arange(X.shape[0])
genes_step1 = genes
genes_log_gmean_step1 = genes_log_gmean
umi = X.sum(1).A.flatten()
log_umi = np.log10(umi)
X2=X.copy()
X2.data[:]=1
gene = X2.sum(1).A.flatten()
log_gene = np.log10(gene)
umi_per_gene = umi / gene
log_umi_per_gene = np.log10(umi_per_gene)
cell_attrs = pd.DataFrame(index = cn, data = np.vstack((umi,log_umi,gene,log_gene,umi_per_gene,log_umi_per_gene)).T,
columns=['umi','log_umi','gene','log_gene','umi_per_gene','log_umi_per_gene'])
data_step1 = cell_attrs.iloc[cells_step1]
if n_genes is not None and n_genes < len(genes_step1):
log_gmean_dens = stats.gaussian_kde(genes_log_gmean_step1,bw_method='scott')
xlo = np.linspace(genes_log_gmean_step1.min(),genes_log_gmean_step1.max(),512)
ylo = log_gmean_dens.evaluate(xlo)
xolo = genes_log_gmean_step1
sampling_prob = 1 / (np.interp(xolo,xlo,ylo) + _EPS)
genes_step1 = np.sort(np.random.choice(genes_step1,size=n_genes,p=sampling_prob/sampling_prob.sum(),replace=False))
genes_log_gmean_step1 = np.log10(gmean(X[cells_step1,:][:,genes_step1],eps=gmean_eps))
bin_ind = np.ceil(np.arange(1,genes_step1.size+1) / bin_size)
max_bin = max(bin_ind)
ps = Manager().dict()
for i in range(1,int(max_bin)+1):
genes_bin_regress = genes_step1[bin_ind==i]
umi_bin = X[cells_step1,:][:,genes_bin_regress]
mm = np.vstack((np.ones(data_step1.shape[0]),data_step1['log_umi'].values.flatten())).T
pc_chunksize = umi_bin.shape[1] // os.cpu_count() + 1
pool = Pool(os.cpu_count(), _parallel_init, [genes_bin_regress, umi_bin, gn, mm, ps])
try:
pool.map(_parallel_wrapper, range(umi_bin.shape[1]), chunksize=pc_chunksize)
finally:
pool.close()
pool.join()
ps = ps._getvalue()
model_pars = pd.DataFrame(data = np.vstack([ps[x] for x in gn[genes_step1]]),
columns = ['Intercept','log_umi','theta'],
index = gn[genes_step1])
min_theta = 1e-7
x = model_pars['theta'].values.copy()
x[x<min_theta]=min_theta
model_pars['theta']=x
dispersion_par = np.log10(1 + 10**genes_log_gmean_step1 / model_pars['theta'].values.flatten())
model_pars = model_pars.iloc[:,model_pars.columns!='theta'].copy()
model_pars['dispersion']=dispersion_par
outliers = np.vstack(([is_outlier(model_pars.values[:,i],genes_log_gmean_step1) for i in range(model_pars.shape[1])])).sum(0)>0
filt = np.invert(outliers)
model_pars = model_pars[filt]
genes_step1 = genes_step1[filt]
genes_log_gmean_step1 = genes_log_gmean_step1[filt]
z = FFTKDE(kernel='gaussian', bw='ISJ').fit(genes_log_gmean_step1)
z.evaluate();
bw = z.bw*bw_adjust
x_points = np.vstack((genes_log_gmean,np.array([min(genes_log_gmean_step1)]*genes_log_gmean.size))).max(0)
x_points = np.vstack((x_points,np.array([max(genes_log_gmean_step1)]*genes_log_gmean.size))).min(0)
full_model_pars = pd.DataFrame(data = np.zeros((x_points.size,model_pars.shape[1])),index=gn,columns=model_pars.columns)
for i in model_pars.columns:
kr = statsmodels.nonparametric.kernel_regression.KernelReg(model_pars[i].values,genes_log_gmean_step1[:,None],['c'],reg_type='ll',bw=[bw])
full_model_pars[i] = kr.fit(data_predict=x_points)[0]
theta = 10**genes_log_gmean / (10**full_model_pars['dispersion'].values - 1)
full_model_pars['theta'] = theta
del full_model_pars['dispersion']
model_pars_outliers = outliers
regressor_data = np.vstack((np.ones(cell_attrs.shape[0]),cell_attrs['log_umi'].values)).T
d = X.data
x,y = X.nonzero()
mud = np.exp(full_model_pars.values[:,0][y] + full_model_pars.values[:,1][y] * cell_attrs['log_umi'].values[x])
vard = mud+mud**2 / full_model_pars['theta'].values.flatten()[y]
X.data[:] = (d - mud) / vard**0.5
X.data[X.data<0]=0
X.eliminate_zeros()
clip = np.sqrt(X.shape[0]/30)
X.data[X.data>clip]=clip
if inplace:
adata.raw = adata.copy()
d = dict(zip(np.arange(X.shape[1]),genes_ix))
x,y = X.nonzero()
y = np.array([d[i] for i in y])
data = X.data
Xnew = sp.sparse.coo_matrix((data, (x, y)), shape=adata.shape).tocsr()
adata.X = Xnew # TODO: add log1p of corrected umi counts to layers
for c in full_model_pars.columns:
adata.var[c+'_sct'] = full_model_pars[c]
for c in cell_attrs.columns:
adata.obs[c+'_sct'] = cell_attrs[c]
for c in model_pars.columns:
adata.var[c+'_step1_sct'] = model_pars[c]
z = pd.Series(index=gn,data=np.zeros(gn.size,dtype='int'))
z[gn[genes_step1]]=1
w = pd.Series(index=gn,data=np.zeros(gn.size,dtype='int'))
w[gn]=genes_log_gmean
adata.var['genes_step1_sct'] = z
adata.var['log10_gmean_sct'] = w
else:
adata_new = AnnData(X=X)
adata_new.var_names = pd.Index(gn)
adata_new.obs_names = adata.obs_names
adata_new.raw = adata.copy()
for c in full_model_pars.columns:
adata_new.var[c+'_sct'] = full_model_pars[c]
for c in cell_attrs.columns:
adata_new.obs[c+'_sct'] = cell_attrs[c]
for c in model_pars.columns:
adata_new.var[c+'_step1_sct'] = model_pars[c]
z = pd.Series(index=gn,data=np.zeros(gn.size,dtype='int'))
z[gn[genes_step1]]=1
adata_new.var['genes_step1_sct'] = z
adata_new.var['log10_gmean_sct'] = genes_log_gmean
return adata_new
#if __name__ == '__main__':
# SCTransform()