1010import loompy as lp
1111from operator import attrgetter
1212from typing import Type , Sequence
13- from pyscenic .genesig import GeneSignature
13+ from pyscenic .genesig import GeneSignature , openfile
1414from pyscenic .transform import df2regulons
1515from pyscenic .utils import load_motifs , load_from_yaml , save_to_yaml
1616from pyscenic .binarization import binarize
17+ from pathlib import Path , PurePath
1718
1819
1920__all__ = ['save_matrix' , 'load_exp_matrix' , 'load_signatures' , 'save_enriched_motifs' , 'load_adjacencies' ,
@@ -74,10 +75,25 @@ def load_exp_matrix_as_loom(fname,
7475 columns = ds .ca [attribute_name_cell_id ]).T
7576
7677
77- FILE_EXTENSION2SEPARATOR = {
78- '.tsv' : '\t ' ,
79- '.csv' : ','
80- }
def suffixes_to_separator(extension):
    """
    Map the suffixes of a file name to the field separator of a delimited text file.

    Matching is case-insensitive. When both '.csv' and '.tsv' are present, '.csv' wins.

    :param extension: The suffixes of a file name, e.g. the list ``PurePath(fname).suffixes``
        (``['.tsv', '.gz']``). A plain string is matched by substring.
    :return: ',' for a CSV suffix, a tab character for a TSV suffix, None when neither is present.
    """
    # Lower-case first so that e.g. "matrix.CSV" is still recognised.
    if isinstance(extension, str):
        suffixes = extension.lower()
    else:
        suffixes = [suffix.lower() for suffix in extension]

    if '.csv' in suffixes:
        return ','
    if '.tsv' in suffixes:
        return '\t'
    # Neither suffix found: callers are responsible for handling the missing separator.
    return None
83+
84+
def is_valid_suffix(extension, method):
    """
    Check whether a file name carries a suffix that is accepted for the given method.

    Accepted suffixes (matched case-insensitively):

    - 'grn' and 'aucell': .csv, .tsv, .loom
    - 'ctx': .csv, .tsv
    - 'ctx_yaml': .yaml, .yml

    :param extension: The list of suffixes of a file name, e.g. ``PurePath(fname).suffixes``.
    :param method: One of 'grn', 'aucell', 'ctx' or 'ctx_yaml'.
    :return: True when at least one of the suffixes is accepted for the method, False otherwise.
    :raises TypeError: When extension is not a list.
    :raises ValueError: When method is not one of the known method names.
    """
    # Explicit raise instead of assert: asserts are stripped when running with -O.
    if not isinstance(extension, list):
        raise TypeError('extension should be of type "list"')

    delimited = ('.csv', '.tsv')
    valid_extensions_by_method = {
        'grn': delimited + ('.loom',),
        'aucell': delimited + ('.loom',),
        'ctx': delimited,
        'ctx_yaml': ('.yaml', '.yml'),
    }
    if method not in valid_extensions_by_method:
        # Previously an unknown method surfaced as an UnboundLocalError further down.
        raise ValueError('Unknown method "{}".'.format(method))

    valid_extensions = valid_extensions_by_method[method]
    return any(suffix.lower() in valid_extensions for suffix in extension)
8197
8298
8399def load_exp_matrix (fname : str , transpose : bool = False ,
@@ -94,12 +110,13 @@ def load_exp_matrix(fname: str, transpose: bool = False,
94110 :param return_sparse: Returns a sparse matrix when loading from loom
95111 :return: A 2-dimensional dataframe (rows = cells x columns = genes).
96112 """
97- extension = os .path .splitext (fname )[1 ].lower ()
98- if extension in FILE_EXTENSION2SEPARATOR .keys ():
99- df = pd .read_csv (fname , sep = FILE_EXTENSION2SEPARATOR [extension ], header = 0 , index_col = 0 )
100- return df .T if transpose else df
101- elif extension == '.loom' :
102- return load_exp_matrix_as_loom (fname , return_sparse , attribute_name_cell_id , attribute_name_gene )
113+ extension = PurePath (fname ).suffixes
114+ if is_valid_suffix (extension , 'grn' ):
115+ if '.loom' in extension :
116+ return load_exp_matrix_as_loom (fname , return_sparse , attribute_name_cell_id , attribute_name_gene )
117+ else :
118+ df = pd .read_csv (fname , sep = suffixes_to_separator (extension ), header = 0 , index_col = 0 )
119+ return df .T if transpose else df
103120 else :
104121 raise ValueError ("Unknown file format \" {}\" ." .format (fname ))
105122
def save_matrix(df: pd.DataFrame, fname: str, transpose: bool = False) -> None:
    """
    Save a matrix to disk. The file format is derived from the suffixes of the file name.

    Files with a .loom suffix are written with save_df_as_loom; files with a .csv or .tsv
    suffix are written as delimited text with the matching separator.

    :param df: The dataframe to store.
    :param fname: The name of the file to be written.
    :param transpose: Should the expression matrix be stored as (rows = genes x columns = cells)?
        Only applied when writing delimited text; the loom branch stores df as given.
    :raises ValueError: When none of the suffixes of fname is a supported format.
    """
    # Lower-case the suffixes so that e.g. "matrix.CSV" is still recognised.
    extension = [suffix.lower() for suffix in PurePath(fname).suffixes]
    if not is_valid_suffix(extension, 'aucell'):
        raise ValueError("Unknown file format \"{}\".".format(fname))

    if '.loom' in extension:
        return save_df_as_loom(df, fname)

    matrix = df.T if transpose else df
    matrix.to_csv(fname, sep=suffixes_to_separator(extension))
124142
125143
126144def guess_separator (fname : str ) -> str :
127- with open (fname , 'r' ) as f :
145+ with openfile (fname , 'r' ) as f :
128146 lines = f .readlines ()
129147
148+ # decode if gzipped file:
149+ for i ,x in enumerate (lines ):
150+ if isinstance (x , (bytes , bytearray )):
151+ lines [i ] = x .decode ()
152+
130153 def count_columns (sep ):
131154 return [len (line .split (sep )) for line in lines if not line .strip ().startswith ('#' ) and line .strip ()]
132155
def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
    """
    Load gene signatures from disk. The file format is derived from the suffixes of the file name.

    Dispatch order: delimited text (.csv/.tsv, read with load_motifs and converted with
    df2regulons), YAML (.yaml/.yml), GMT (.gmt, separator guessed from the file content)
    and pickle (.dat).

    :param fname: The name of the file that contains the signatures.
    :return: A list of gene signatures.
    :raises ValueError: When none of the suffixes of fname is a supported format.
    """
    # Lower-case the suffixes so that e.g. "regulons.CSV" is still recognised.
    extension = [suffix.lower() for suffix in PurePath(fname).suffixes]
    if is_valid_suffix(extension, 'ctx'):
        # csv/tsv
        return df2regulons(load_motifs(fname, sep=suffixes_to_separator(extension)))
    elif is_valid_suffix(extension, 'ctx_yaml'):
        return load_from_yaml(fname)
    elif '.gmt' in extension:
        sep = guess_separator(fname)
        return GeneSignature.from_gmt(fname,
                                      field_separator=sep,
                                      gene_separator=sep)
    elif '.dat' in extension:
        # extension is a list of suffixes, so this must be a membership test; comparing the
        # list to the string '.dat' is never true and made this branch unreachable.
        # NOTE: unpickling executes arbitrary code; only load .dat files from a trusted source.
        with openfile(fname, 'rb') as f:
            return pickle.load(f)
    else:
        raise ValueError("Unknown file format \"{}\".".format(fname))
def save_enriched_motifs(df, fname:str) -> None:
    """
    Save enriched motifs to disk. The file format is derived from the suffixes of the file name.

    With a .csv or .tsv suffix the dataframe is written as delimited text. For any other
    suffix the dataframe is first converted to regulons with df2regulons, which are then
    written as JSON (.json, regulon name -> list of target gene names), pickle (.dat),
    GMT (.gmt) or YAML (.yaml/.yml).

    :param df: The dataframe of enriched motifs.
    :param fname: The name of the file to be written.
    :return:
    :raises ValueError: When none of the suffixes of fname is a supported format.
    """
    # Lower-case the suffixes so that e.g. "motifs.CSV" is still recognised.
    extension = [suffix.lower() for suffix in PurePath(fname).suffixes]
    if is_valid_suffix(extension, 'ctx'):
        df.to_csv(fname, sep=suffixes_to_separator(extension))
    else:
        # The conversion happens before the format check, as in the original control flow.
        regulons = df2regulons(df)
        if '.json' in extension:
            name2targets = {r.name: list(r.gene2weight.keys()) for r in regulons}
            with openfile(fname, 'w') as f:
                f.write(json.dumps(name2targets))
        elif '.dat' in extension:
            with openfile(fname, 'wb') as f:
                pickle.dump(regulons, f)
        elif '.gmt' in extension:
            GeneSignature.to_gmt(fname, regulons)
        elif is_valid_suffix(extension, 'ctx_yaml'):
            save_to_yaml(regulons, fname)
        else:
            raise ValueError("Unknown file format \"{}\".".format(fname))
194218
195219
def load_adjacencies(fname: str) -> pd.DataFrame:
    """
    Load adjacencies from a delimited text file (.csv or .tsv).

    The file is read with explicit dtypes for keys 0, 1 and 2 (str, str, float64; presumably
    the first three columns - confirm against the writer of these files). keep_default_na is
    disabled, so pandas' default NaN markers are not applied while parsing.

    :param fname: The name of the file that contains the adjacencies.
    :return: The adjacencies as a dataframe.
    :raises ValueError: When fname has neither a .csv nor a .tsv suffix.
    """
    # Lower-case the suffixes so that e.g. "adjacencies.TSV" is still recognised.
    extension = [suffix.lower() for suffix in PurePath(fname).suffixes]
    sep = suffixes_to_separator(extension)
    if sep is None:
        # Fail loudly, like the other loaders in this module, instead of handing sep=None to pandas.
        raise ValueError("Unknown file format \"{}\".".format(fname))
    return pd.read_csv(fname, sep=sep, dtype={0: str, 1: str, 2: np.float64}, keep_default_na=False)
199223
200224
201225def load_modules (fname : str ) -> Sequence [Type [GeneSignature ]]:
202226 # Loading from YAML is extremely slow. Therefore this is a potential performance improvement.
203227 # Potential improvements are switching to JSON or to use a CLoader:
204228 # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
205229 # The alternative for which was opted in the end is binary pickling.
206- if fname .endswith ('.yaml' ) or fname .endswith ('.yml' ):
230+ extension = PurePath (fname ).suffixes
231+ if is_valid_suffix (extension , 'ctx_yaml' ):
207232 return load_from_yaml (fname )
208- elif fname . endswith ( '.dat' ) :
209- with open (fname , 'rb' ) as f :
233+ elif '.dat' in extension :
234+ with openfile (fname , 'rb' ) as f :
210235 return pickle .load (f )
211- elif fname . endswith ( '.gmt' ) :
236+ elif '.gmt' in extension :
212237 sep = guess_separator (fname )
213238 return GeneSignature .from_gmt (fname ,
214239 field_separator = sep ,
0 commit comments