@@ -99,12 +99,29 @@ def __init__(self,config_file):
9999 self .df_breakpoints = None
100100 if "breakpoints" in data_yaml :
101101 self .df_breakpoints = pd .read_csv (data_yaml ["breakpoints" ],sep = "\t " )
102+ required_cols = {'sample' ,'chr1' ,'chr2' ,'pos1' ,'pos2' }
103+ if not required_cols .issubset (self .df_breakpoints .columns ):
104+ raise Exception ("The breakpoints file must contain at least the following columns: sample, chr1, pos1, chr2, pos2." )
105+ self .df_breakpoints ['sample' ] = self .df_breakpoints ['sample' ].astype (str )
106+ self .df_breakpoints ['chr1' ] = self .df_breakpoints ['chr1' ].astype (str )
107+ self .df_breakpoints ['chr2' ] = self .df_breakpoints ['chr2' ].astype (str )
108+
102109 # Remove breakpoints corresponding to samples for which we have no expression.
103110 selected_indices = []
111+ unselected_samples = set ()
104112 for x in self .df_breakpoints .index :
105113 if self .df_breakpoints .loc [x ,"sample" ] in self .samples : selected_indices .append (x )
114+ else : unselected_samples .add (self .df_breakpoints .loc [x ,"sample" ])
115+ if len (selected_indices )== 0 and self .df_breakpoints .shape [0 ]> 0 :
116+ raise Exception ("In the breakpoints file, the sample column does not match any sample provided in the gene expression file." )
117+ if len (unselected_samples )> 0 :
118+ print ("Warning: some samples were listed in the breakpoints file but not present in the gene expression file: " + "," .join (unselected_samples ))
106119 self .df_breakpoints = self .df_breakpoints .loc [selected_indices ,:].reset_index (drop = True )
107120
121+ if len (self .df_breakpoints )> 200000 :
122+ print ("Warning: you provided " + str (len (self .df_breakpoints ))+ " breakpoints, which is more than pyjacker is designed to handle. Pyjacker might take a while to complete. Please make sure that you only provided somatic SVs and filtered germline ones." )
123+
124+
108125 if len (self .samples )< 3 : sys .exit ("Error: Insufficient number of samples in the RNA TPM file. Pyjacker is designed to run with at least 10 samples, and cannot be run with fewer than 3." )
109126 print ("Running pyjacker on {} samples." .format (len (self .samples )))
110127 if len (self .samples )< 10 :
@@ -116,16 +133,15 @@ def __init__(self,config_file):
116133 self .CNAs = read_CNAs (df_CNAs ,self .samples )
117134 self .df_breakpoints = add_cna_breakpoints (self .CNAs ,self .df_breakpoints ,self .chr_lengths )
118135 TPM_corrected_file = os .path .join (self .cache_dir ,"TPM_corrected.tsv" )
119- #if os.path.exists(TPM_corrected_file):
120- # self.df_TPM = pd.read_csv(TPM_corrected_file,sep="\t",index_col=0)
121- #else:
122136 print ("Computing gene expression values corrected for copy number..." )
123137 self .df_TPM = correct_exp_cn (self .df_TPM ,self .CNAs ,self .genes ,chr_lengths = self .chr_lengths )
124138 self .df_TPM .to_csv (TPM_corrected_file ,sep = "\t " )
125139
126140 else : self .CNAs = None
127141 self .breakpoints = read_breakpoints (self .df_breakpoints )
128-
142+ if len (self .breakpoints )== 0 :
143+ raise Exception ("No breakpoints were provided." )
144+
129145 # Filter genes: the gene must be expressed in at least one sample.
130146 # Also require at least 30% of samples to have expression lower than max/10.
131147 # This reduces the number of tests.
@@ -146,9 +162,6 @@ def __init__(self,config_file):
146162 self .ase_dna_dir = None
147163 cache_ase_file = os .path .join (self .cache_dir ,"ase.tsv" )
148164 cache_SNPs_file = os .path .join (self .cache_dir ,"SNPs.tsv" )
149- #if os.path.exists(cache_ase_file) and os.path.exists(cache_SNPs_file):
150- # self.df_ase = pd.read_csv((cache_ase_file),sep="\t",index_col=0)
151- # self.df_n_SNPs=pd.read_csv((cache_SNPs_file),sep="\t",index_col=0)
152165 print ("Computing allele-specific expression scores..." )
153166 imprinted_genes_file = None
154167 if "imprinted_genes_file" in data_yaml :
@@ -209,7 +222,6 @@ def init_worker():
209222 df_result = pd .DataFrame (results_flattened )
210223 if not df_result .empty :
211224 df_result = df_result .sort_values ("score" ,ascending = False )
212- #print(df_result)
213225 return df_result
214226
215227
@@ -228,6 +240,8 @@ def compute_empirical_pval(score,null_distribution):
228240 return np .sum ([score < x for x in null_distribution ]) / len (null_distribution )
229241 if group_by_gene :
230242 pvals = [compute_empirical_pval (score ,null_distribution ) for score in gene_scores ]
243+ if len (pvals )== 0 :
244+ raise Exception ("No putative enhancer hijacking event was identified." )
231245 qvals = multipletests (pvals ,alpha = 0.05 ,method = "fdr_bh" )[1 ]
232246 gene2qval = {}
233247 for i in range (len (gene_ids )):
@@ -373,6 +387,9 @@ def main():
373387 pyjack = pyjacker (args .config_file )
374388 print ("Searching for enhancer hijacking events..." )
375389 df_result = pyjack .find_EH (random_candidates = False )
390+ if df_result .shape [0 ]== 0 :
391+ print ("No putative enhancer hijacking event could be identified." )
392+ sys .exit (0 )
376393 null_distribution = pyjack .estimate_null_distribution ()
377394 pyjack .save_results (df_result ,null_distribution = null_distribution )
378395
0 commit comments