Skip to content

Commit e2d3633

Browse files
committed
fix error when sample IDs are numbers
1 parent 91d87a4 commit e2d3633

3 files changed

Lines changed: 29 additions & 9 deletions

File tree

pyjacker/breakpoints.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ def read_breakpoints(df_breakpoints):
2626
if "orientation1" in df_breakpoints.columns:
2727
orientation1 = df_breakpoints.loc[i,"orientation1"]
2828
orientation2 = df_breakpoints.loc[i,"orientation2"]
29+
else:
30+
orientation1="+"
31+
orientation2="+"
2932

3033
#if chr1==chr2 and abs(pos1-pos2)<5000: continue # Filter out small SVs ?
3134

pyjacker/pyjacker.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -99,12 +99,29 @@ def __init__(self,config_file):
9999
self.df_breakpoints = None
100100
if "breakpoints" in data_yaml:
101101
self.df_breakpoints = pd.read_csv(data_yaml["breakpoints"],sep="\t")
102+
required_cols = {'sample','chr1','chr2','pos1','pos2'}
103+
if not required_cols.issubset(self.df_breakpoints.columns):
104+
raise Exception("The breakpoints file must contain at least the following columns: sample, chr1, pos1, chr2, pos2.")
105+
self.df_breakpoints['sample'] = self.df_breakpoints['sample'].astype(str)
106+
self.df_breakpoints['chr1'] = self.df_breakpoints['chr1'].astype(str)
107+
self.df_breakpoints['chr2'] = self.df_breakpoints['chr2'].astype(str)
108+
102109
# Remove breakpoints corresponding to samples for which we have no expression.
103110
selected_indices=[]
111+
unselected_samples= set()
104112
for x in self.df_breakpoints.index:
105113
if self.df_breakpoints.loc[x,"sample"] in self.samples: selected_indices.append(x)
114+
else: unselected_samples.add(self.df_breakpoints.loc[x,"sample"])
115+
if len(selected_indices)==0 and self.df_breakpoints.shape[0]>0:
116+
raise Exception("In the breakpoints file, the sample column does not match any sample provided in the gene expression file.")
117+
if len(unselected_samples)>0:
118+
print("Warning: some samples were listed in the breakpoints file but not present in the gene expression file: "+ ",".join(unselected_samples))
106119
self.df_breakpoints = self.df_breakpoints.loc[selected_indices,:].reset_index(drop=True)
107120

121+
if len(self.df_breakpoints)>200000:
122+
print("Warning: you provided "+str(len(self.df_breakpoints))+" breakpoints, which is more than pyjacker is designed to handle. Pyjacker might take a while to complete. Please make sure that you only provided somatic SVs and filtered germline ones.")
123+
124+
108125
if len(self.samples)<3: sys.exit("Error: Insufficient number of samples in the RNA TPM file. Pyjacker is designed to run with at least 10 samples, and cannot be run with fewer than 3.")
109126
print("Running pyjacker on {} samples.".format(len(self.samples)))
110127
if len(self.samples)<10:
@@ -116,16 +133,15 @@ def __init__(self,config_file):
116133
self.CNAs = read_CNAs(df_CNAs,self.samples)
117134
self.df_breakpoints = add_cna_breakpoints(self.CNAs,self.df_breakpoints,self.chr_lengths)
118135
TPM_corrected_file = os.path.join(self.cache_dir,"TPM_corrected.tsv")
119-
#if os.path.exists(TPM_corrected_file):
120-
# self.df_TPM = pd.read_csv(TPM_corrected_file,sep="\t",index_col=0)
121-
#else:
122136
print("Computing gene expression values corrected for copy number...")
123137
self.df_TPM = correct_exp_cn(self.df_TPM,self.CNAs,self.genes,chr_lengths=self.chr_lengths)
124138
self.df_TPM.to_csv(TPM_corrected_file,sep="\t")
125139

126140
else: self.CNAs = None
127141
self.breakpoints = read_breakpoints(self.df_breakpoints)
128-
142+
if len(self.breakpoints)==0:
143+
raise Exception("No breakpoints were provided.")
144+
129145
# Filter genes: the gene must be expressed in at least one sample.
130146
# Also require at least 30% of samples to have expression lower than max/10.
131147
# This reduces the number of tests.
@@ -146,9 +162,6 @@ def __init__(self,config_file):
146162
self.ase_dna_dir = None
147163
cache_ase_file = os.path.join(self.cache_dir,"ase.tsv")
148164
cache_SNPs_file = os.path.join(self.cache_dir,"SNPs.tsv")
149-
#if os.path.exists(cache_ase_file) and os.path.exists(cache_SNPs_file):
150-
# self.df_ase = pd.read_csv((cache_ase_file),sep="\t",index_col=0)
151-
# self.df_n_SNPs=pd.read_csv((cache_SNPs_file),sep="\t",index_col=0)
152165
print("Computing allele-specific expression scores...")
153166
imprinted_genes_file = None
154167
if "imprinted_genes_file" in data_yaml:
@@ -209,7 +222,6 @@ def init_worker():
209222
df_result = pd.DataFrame(results_flattened)
210223
if not df_result.empty:
211224
df_result = df_result.sort_values("score",ascending=False)
212-
#print(df_result)
213225
return df_result
214226

215227

@@ -228,6 +240,8 @@ def compute_empirical_pval(score,null_distribution):
228240
return np.sum([score<x for x in null_distribution]) / len(null_distribution)
229241
if group_by_gene:
230242
pvals = [compute_empirical_pval(score,null_distribution) for score in gene_scores]
243+
if len(pvals)==0:
244+
raise Exception("No putative enhancer hijacking event was identified.")
231245
qvals = multipletests(pvals,alpha=0.05,method="fdr_bh")[1]
232246
gene2qval={}
233247
for i in range(len(gene_ids)):
@@ -373,6 +387,9 @@ def main():
373387
pyjack = pyjacker(args.config_file)
374388
print("Searching for enhancer hijacking events...")
375389
df_result=pyjack.find_EH(random_candidates=False)
390+
if df_result.shape[0]==0:
391+
print("No putative enhancer hijacking event could be identified.")
392+
sys.exit(0)
376393
null_distribution = pyjack.estimate_null_distribution()
377394
pyjack.save_results(df_result,null_distribution=null_distribution)
378395

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ packages = ["pyjacker"]
77

88
[project]
99
name = 'pyjacker'
10-
version = "1.1.4"
10+
version = "1.1.5"
1111
description = 'Enhancer hijacking detection from WGS and RNAseq.'
1212
readme = 'README.md'
1313
authors = [

0 commit comments

Comments
 (0)