Skip to content

Commit 0ef0119

Browse files
authored
Merge pull request #171 from aertslab/cli_file_io
CLI: Add gzip support for intermediate files
2 parents d3120af + 55ba7d0 commit 0ef0119

File tree

5 files changed

+95
-61
lines changed

5 files changed

+95
-61
lines changed

scripts/arboreto_with_multiprocessing.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,17 @@
2222
parser_grn = argparse.ArgumentParser(description='Run Arboreto using a multiprocessing pool')
2323

2424
parser_grn.add_argument('expression_mtx_fname',
25-
type=argparse.FileType('r'),
25+
type=str,
2626
help='The name of the file that contains the expression matrix for the single cell experiment.'
2727
' Two file formats are supported: csv (rows=cells x columns=genes) or loom (rows=genes x columns=cells).')
2828
parser_grn.add_argument('tfs_fname',
29-
type=argparse.FileType('r'),
29+
type=str,
3030
help='The name of the file that contains the list of transcription factors (TXT; one TF per line).')
3131
parser_grn.add_argument('-m', '--method', choices=['genie3', 'grnboost2'],
3232
default='grnboost2',
3333
help='The algorithm for gene regulatory network reconstruction (default: grnboost2).')
3434
parser_grn.add_argument('-o', '--output',
35-
type=argparse.FileType('w'), default=sys.stdout,
35+
type=str, default=sys.stdout,
3636
help='Output file/stream, i.e. a table of TF-target genes (TSV).')
3737
parser_grn.add_argument('--num_workers',
3838
type=int, default=cpu_count(),
@@ -90,7 +90,7 @@ def run_infer_partial_network(target_gene_index):
9090
if __name__ == '__main__':
9191

9292
start_time = time.time()
93-
ex_matrix = load_exp_matrix(args.expression_mtx_fname.name,
93+
ex_matrix = load_exp_matrix(args.expression_mtx_fname,
9494
(args.transpose == 'yes'),
9595
args.sparse,
9696
args.cell_id_attribute,
@@ -105,7 +105,7 @@ def run_infer_partial_network(target_gene_index):
105105
end_time = time.time()
106106
print(f'Loaded expression matrix of {ex_matrix.shape[0]} cells and {ex_matrix.shape[1]} genes in {end_time - start_time} seconds...', file=sys.stdout)
107107

108-
tf_names = load_tf_names(args.tfs_fname.name)
108+
tf_names = load_tf_names(args.tfs_fname)
109109
print(f'Loaded {len(tf_names)} TFs...', file=sys.stdout)
110110

111111
ex_matrix, gene_names, tf_names = _prepare_input(ex_matrix, gene_names, tf_names)
@@ -126,5 +126,5 @@ def run_infer_partial_network(target_gene_index):
126126
end_time = time.time()
127127
print(f'Done in {end_time - start_time} seconds.', file=sys.stdout)
128128

129-
adj.to_csv(args.output, index=False, sep="\t")
129+
adj.to_csv(args.output, index=False, sep='\t')
130130

src/pyscenic/cli/pyscenic.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
import sys
2525
from typing import Type, Sequence
2626
from .utils import load_exp_matrix, load_signatures, save_matrix, save_enriched_motifs, load_adjacencies, load_modules, append_auc_mtx, ATTRIBUTE_NAME_CELL_IDENTIFIER, ATTRIBUTE_NAME_GENE
27+
from .utils import is_valid_suffix, suffixes_to_separator
28+
from pathlib import Path, PurePath
2729

2830
try:
2931
from pyscenic._version import get_versions
@@ -75,9 +77,8 @@ def find_adjacencies_command(args):
7577

7678
LOGGER.info("Writing results to file.")
7779

78-
extension = os.path.splitext(args.output.name)[1].lower()
79-
separator = '\t' if extension == '.tsv' else ','
80-
network.to_csv(args.output, index=False, sep=separator)
80+
extension = PurePath(fname).suffixes
81+
network.to_csv(args.output, index=False, sep=suffixes_to_separator(extension))
8182

8283

8384
def adjacencies2modules(args):
@@ -130,8 +131,8 @@ def prune_targets_command(args):
130131
# Potential improvements are switching to JSON or to use a CLoader:
131132
# https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
132133
# The alternative for which was opted in the end is binary pickling.
133-
extension = os.path.splitext(args.module_fname.name)[1].lower()
134-
if extension in {'.csv', '.tsv'}:
134+
extension = PurePath(args.module_fname.name).suffixes
135+
if is_valid_suffix(extension, 'ctx'):
135136
if args.expression_mtx_fname is None:
136137
LOGGER.error("No expression matrix is supplied.")
137138
sys.exit(0)
@@ -201,8 +202,8 @@ def aucell_command(args):
201202
num_workers=args.num_workers)
202203

203204
LOGGER.info("Writing results to file.")
204-
extension = os.path.splitext(args.output.name)[1].lower()
205-
if extension == '.loom':
205+
extension = PurePath(args.output.name).suffixes
206+
if '.loom' in extension:
206207
try:
207208
copyfile(args.expression_mtx_fname.name, args.output.name)
208209
append_auc_mtx(args.output.name, auc_mtx, signatures, args.seed, args.num_workers)

src/pyscenic/cli/utils.py

Lines changed: 63 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,11 @@
1010
import loompy as lp
1111
from operator import attrgetter
1212
from typing import Type, Sequence
13-
from pyscenic.genesig import GeneSignature
13+
from pyscenic.genesig import GeneSignature, openfile
1414
from pyscenic.transform import df2regulons
1515
from pyscenic.utils import load_motifs, load_from_yaml, save_to_yaml
1616
from pyscenic.binarization import binarize
17+
from pathlib import Path, PurePath
1718

1819

1920
__all__ = ['save_matrix', 'load_exp_matrix', 'load_signatures', 'save_enriched_motifs', 'load_adjacencies',
@@ -74,10 +75,25 @@ def load_exp_matrix_as_loom(fname,
7475
columns=ds.ca[attribute_name_cell_id]).T
7576

7677

77-
FILE_EXTENSION2SEPARATOR = {
78-
'.tsv': '\t',
79-
'.csv': ','
80-
}
78+
def suffixes_to_separator(extension):
    """
    Map the suffixes of a file name to the column separator of its tabular format.

    :param extension: The suffixes of a file name, typically the list returned by
        ``PurePath(fname).suffixes`` (e.g. ``['.csv', '.gz']`` for ``table.csv.gz``).
    :return: ``','`` when ``.csv`` is among the suffixes, a tab character when ``.tsv``
        is among them, and ``None`` when neither is present.
    """
    # Match case-insensitively so that names such as "NETWORK.TSV" are recognised.
    # A plain string is lower-cased as a whole so that the `in` tests below keep
    # working on it (substring match) exactly as they do without lower-casing.
    if isinstance(extension, str):
        lowered = extension.lower()
    else:
        lowered = [suffix.lower() for suffix in extension]

    # '.csv' is tested first, so it takes precedence when both suffixes are present.
    if '.csv' in lowered:
        return ','
    if '.tsv' in lowered:
        return '\t'
    # No known tabular suffix: returned explicitly so the fall-through is visible.
    return None
83+
84+
85+
def is_valid_suffix(extension, method):
    """
    Check whether a file name carries a suffix that is accepted for a given step.

    :param extension: The suffixes of the file name as a list (or tuple) of strings,
        e.g. ``PurePath(fname).suffixes``. Matching is case-sensitive.
    :param method: Selects the set of accepted suffixes:
        ``'grn'`` or ``'aucell'`` accept ``.csv``, ``.tsv`` and ``.loom``;
        ``'ctx'`` accepts ``.csv`` and ``.tsv``;
        ``'ctx_yaml'`` accepts ``.yaml`` and ``.yml``.
    :return: True if at least one of the suffixes is accepted, False otherwise.
    :raises TypeError: If extension is not a list or tuple (a bare string would be
        treated as a collection of characters and never match).
    :raises ValueError: If method is not one of the names listed above.
    """
    # Raise instead of assert: assertions are removed when Python runs with -O.
    if not isinstance(extension, (list, tuple)):
        raise TypeError('extension should be of type "list"')

    if method in ['grn', 'aucell']:
        valid_extensions = ['.csv', '.tsv', '.loom']
    elif method == 'ctx':
        valid_extensions = ['.csv', '.tsv']
    elif method == 'ctx_yaml':
        valid_extensions = ['.yaml', '.yml']
    else:
        # Without this branch an unknown method fails later with an unbound local.
        raise ValueError('Unknown method "{}".'.format(method))

    # True as soon as any suffix of the file name is in the accepted set.
    return not set(extension).isdisjoint(valid_extensions)
8197

8298

8399
def load_exp_matrix(fname: str, transpose: bool = False,
@@ -94,12 +110,13 @@ def load_exp_matrix(fname: str, transpose: bool = False,
94110
:param return_sparse: Returns a sparse matrix when loading from loom
95111
:return: A 2-dimensional dataframe (rows = cells x columns = genes).
96112
"""
97-
extension = os.path.splitext(fname)[1].lower()
98-
if extension in FILE_EXTENSION2SEPARATOR.keys():
99-
df = pd.read_csv(fname, sep=FILE_EXTENSION2SEPARATOR[extension], header=0, index_col=0)
100-
return df.T if transpose else df
101-
elif extension == '.loom':
102-
return load_exp_matrix_as_loom(fname, return_sparse, attribute_name_cell_id, attribute_name_gene)
113+
extension = PurePath(fname).suffixes
114+
if is_valid_suffix(extension, 'grn'):
115+
if '.loom' in extension:
116+
return load_exp_matrix_as_loom(fname, return_sparse, attribute_name_cell_id, attribute_name_gene)
117+
else:
118+
df = pd.read_csv(fname, sep=suffixes_to_separator(extension), header=0, index_col=0)
119+
return df.T if transpose else df
103120
else:
104121
raise ValueError("Unknown file format \"{}\".".format(fname))
105122

@@ -114,19 +131,25 @@ def save_matrix(df: pd.DataFrame, fname: str, transpose: bool = False) -> None:
114131
:param fname: The name of the file to be written.
115132
:param transpose: Should the expression matrix be stored as (rows = genes x columns = cells)?
116133
"""
117-
extension = os.path.splitext(fname)[1].lower()
118-
if extension in FILE_EXTENSION2SEPARATOR.keys():
119-
(df.T if transpose else df).to_csv(fname, sep=FILE_EXTENSION2SEPARATOR[extension])
120-
elif extension == '.loom':
121-
return save_df_as_loom(df, fname)
134+
extension = PurePath(fname).suffixes
135+
if is_valid_suffix(extension, 'aucell'):
136+
if '.loom' in extension:
137+
return save_df_as_loom(df, fname)
138+
else:
139+
(df.T if transpose else df).to_csv(fname, sep=suffixes_to_separator(extension))
122140
else:
123141
raise ValueError("Unknown file format \"{}\".".format(fname))
124142

125143

126144
def guess_separator(fname: str) -> str:
127-
with open(fname, 'r') as f:
145+
with openfile(fname, 'r') as f:
128146
lines = f.readlines()
129147

148+
# decode if gzipped file:
149+
for i,x in enumerate(lines):
150+
if isinstance(x, (bytes, bytearray)):
151+
lines[i] = x.decode()
152+
130153
def count_columns(sep):
131154
return [len(line.split(sep)) for line in lines if not line.strip().startswith('#') and line.strip()]
132155

@@ -146,18 +169,19 @@ def load_signatures(fname: str) -> Sequence[Type[GeneSignature]]:
146169
:param fname: The name of the file that contains the signatures.
147170
:return: A list of gene signatures.
148171
"""
149-
extension = os.path.splitext(fname)[1].lower()
150-
if extension in FILE_EXTENSION2SEPARATOR.keys():
151-
return df2regulons(load_motifs(fname, sep=FILE_EXTENSION2SEPARATOR[extension]))
152-
elif extension in {'.yaml', '.yml'}:
172+
extension = PurePath(fname).suffixes
173+
if is_valid_suffix(extension, 'ctx'):
174+
# csv/tsv
175+
return df2regulons(load_motifs(fname, sep=suffixes_to_separator(extension)))
176+
elif is_valid_suffix(extension, 'ctx_yaml'):
153177
return load_from_yaml(fname)
154-
elif extension.endswith('.gmt'):
178+
elif '.gmt' in extension:
155179
sep = guess_separator(fname)
156180
return GeneSignature.from_gmt(fname,
157181
field_separator=sep,
158182
gene_separator=sep)
159183
elif extension == '.dat':
160-
with open(fname, 'rb') as f:
184+
with openfile(fname, 'rb') as f:
161185
return pickle.load(f)
162186
else:
163187
raise ValueError("Unknown file format \"{}\".".format(fname))
@@ -173,42 +197,43 @@ def save_enriched_motifs(df, fname:str) -> None:
173197
:param fname:
174198
:return:
175199
"""
176-
extension = os.path.splitext(fname)[1].lower()
177-
if extension in FILE_EXTENSION2SEPARATOR.keys():
178-
df.to_csv(fname, sep=FILE_EXTENSION2SEPARATOR[extension])
200+
extension = PurePath(fname).suffixes
201+
if is_valid_suffix(extension, 'ctx'):
202+
df.to_csv(fname, sep=suffixes_to_separator(extension))
179203
else:
180204
regulons = df2regulons(df)
181-
if extension == '.json':
205+
if '.json' in extension:
182206
name2targets = {r.name: list(r.gene2weight.keys()) for r in regulons}
183-
with open(fname, 'w') as f:
207+
with openfile(fname, 'w') as f:
184208
f.write(json.dumps(name2targets))
185-
elif extension == '.dat':
186-
with open(fname, 'wb') as f:
209+
elif '.dat' in extension:
210+
with openfile(fname, 'wb') as f:
187211
pickle.dump(regulons, f)
188-
elif extension == '.gmt':
212+
elif '.gmt' in extension:
189213
GeneSignature.to_gmt(fname, regulons)
190-
elif extension in {'.yaml', '.yml'}:
214+
elif is_valid_suffix(extension, 'ctx_yaml'):
191215
save_to_yaml(regulons, fname)
192216
else:
193217
raise ValueError("Unknown file format \"{}\".".format(fname))
194218

195219

196220
def load_adjacencies(fname: str) -> pd.DataFrame:
197-
extension = os.path.splitext(fname)[1].lower().lower()
198-
return pd.read_csv(fname, sep=FILE_EXTENSION2SEPARATOR[extension], dtype={0:str,1:str,2:np.float64}, keep_default_na=False )
221+
extension = PurePath(fname).suffixes
222+
return pd.read_csv(fname, sep=suffixes_to_separator(extension), dtype={0:str,1:str,2:np.float64}, keep_default_na=False )
199223

200224

201225
def load_modules(fname: str) -> Sequence[Type[GeneSignature]]:
202226
# Loading from YAML is extremely slow. Therefore this is a potential performance improvement.
203227
# Potential improvements are switching to JSON or to use a CLoader:
204228
# https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
205229
# The alternative for which was opted in the end is binary pickling.
206-
if fname.endswith('.yaml') or fname.endswith('.yml'):
230+
extension = PurePath(fname).suffixes
231+
if is_valid_suffix(extension, 'ctx_yaml'):
207232
return load_from_yaml(fname)
208-
elif fname.endswith('.dat'):
209-
with open(fname, 'rb') as f:
233+
elif '.dat' in extension:
234+
with openfile(fname, 'rb') as f:
210235
return pickle.load(f)
211-
elif fname.endswith('.gmt'):
236+
elif '.gmt' in extension:
212237
sep = guess_separator(fname)
213238
return GeneSignature.from_gmt(fname,
214239
field_separator=sep,

src/pyscenic/genesig.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@
1111
from cytoolz import merge_with, dissoc, keyfilter, first, second
1212
from frozendict import frozendict
1313
from itertools import chain
14-
14+
import gzip
1515
from cytoolz import memoize, merge
1616

1717

18+
1819
def convert(genes):
1920
# Genes supplied as dictionary.
2021
if isinstance(genes, Mapping):
@@ -27,6 +28,13 @@ def convert(genes):
2728
return frozendict(zip(genes, repeat(1.0)))
2829

2930

31+
def openfile(filename, mode='r'):
    """
    Open a plain or gzip-compressed file, selected by the ``.gz`` file name ending.

    The mode has the same meaning for both kinds of file: without an explicit ``'b'``
    the file is opened in text mode, as the built-in ``open`` does.

    :param filename: The name of the file to open. Names ending in ``.gz`` are opened
        through ``gzip.open``, all others through the built-in ``open``.
    :param mode: The mode, e.g. ``'r'``, ``'w'``, ``'rb'``, ``'wb'`` or ``'wt'``.
    :return: The opened file object; the caller is responsible for closing it
        (it can be used as a context manager).
    """
    if str(filename).endswith('.gz'):
        # gzip.open treats 'r'/'w'/'a' as binary, whereas open treats them as text.
        # Request text mode explicitly so callers get str for both kinds of file.
        if 'b' not in mode and 't' not in mode:
            mode += 't'
        return gzip.open(filename, mode)
    return open(filename, mode)
36+
37+
3038
@attr.s(frozen=True)
3139
class GeneSignature(yaml.YAMLObject):
3240
"""
@@ -66,8 +74,10 @@ def from_gmt(cls, fname: str, field_separator: str = ',', gene_separator: str =
6674
assert os.path.exists(fname), "{} does not exist.".format(fname)
6775

6876
def signatures():
69-
with open(fname, "r") as file:
77+
with openfile(fname, "r") as file:
7078
for line in file:
79+
if isinstance(line, (bytes, bytearray)):
80+
line = line.decode()
7181
if line.startswith("#") or not line.strip():
7282
continue
7383
columns = re.split(field_separator, line.rstrip())
@@ -87,7 +97,7 @@ def to_gmt(cls, fname: str, signatures: List[Type['GeneSignature']], field_separ
8797
:param gene_separator: The separator that separates the genes.
8898
"""
8999
#assert not os.path.exists(fname), "{} already exists.".format(fname)
90-
with open(fname, "wt") as file:
100+
with openfile(fname, "wt") as file:
91101
for signature in signatures:
92102
genes = gene_separator.join(signature.genes)
93103
file.write("{}{}{}{}{}\n".format(signature.name, field_separator,
@@ -106,7 +116,7 @@ def from_grp(cls, fname, name: str) -> 'GeneSignature':
106116
"""
107117
# https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats
108118
assert os.path.exists(fname), "{} does not exist.".format(fname)
109-
with open(fname, "r") as file:
119+
with openfile(fname, "r") as file:
110120
return GeneSignature(name=name,
111121
gene2weight=[line.rstrip() for line in file if not line.startswith("#") and line.strip()])
112122

@@ -124,7 +134,7 @@ def from_rnk(cls, fname: str, name: str, field_separator=",") -> 'GeneSignature'
124134
assert os.path.exists(fname), "{} does not exist.".format(fname)
125135

126136
def columns():
127-
with open(fname, "r") as file:
137+
with openfile(fname, "r") as file:
128138
for line in file:
129139
if line.startswith("#") or not line.strip():
130140
continue

src/pyscenic/utils.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import pandas as pd
44
from urllib.parse import urljoin
5-
from .genesig import Regulon, GeneSignature
5+
from .genesig import Regulon, GeneSignature, openfile
66
from .math import masked_rho4pairs
77
from itertools import chain
88
import numpy as np
@@ -292,7 +292,7 @@ def save_to_yaml(signatures: Sequence[Type[GeneSignature]], fname: str):
292292
:param signatures:
293293
:return:
294294
"""
295-
with open(fname, 'w') as f:
295+
with openfile(fname, 'w') as f:
296296
f.write(dump(signatures, default_flow_style=False, Dumper=Dumper))
297297

298298

@@ -302,7 +302,7 @@ def load_from_yaml(fname: str) -> Sequence[Type[GeneSignature]]:
302302
:param fname:
303303
:return:
304304
"""
305-
with open(fname, 'r') as f:
305+
with openfile(fname, 'r') as f:
306306
return load(f.read(), Loader=Loader)
307307

308308

@@ -335,5 +335,3 @@ def load_motifs(fname: str, sep: str = ',') -> pd.DataFrame:
335335
return df
336336

337337

338-
339-

0 commit comments

Comments
 (0)