Out-of-memory Processing
emzed.Table.open() reads a table from disk on demand, so the familiar
table API stays available even when the data would not fit in RAM — for
example when combining peak tables from many samples in a workflow.
"""Combine filtered peak tables from many samples without loading them into RAM."""
from pathlib import Path
from tempfile import TemporaryDirectory
import emzed
# Everything below runs inside the temporary directory's lifetime: the sample
# tables and the combined table are backed by files in that directory, which
# is removed as soon as the `with` block exits.
with TemporaryDirectory() as tmp:
    folder = Path(tmp)

    # Pretend we already have peak tables from several samples on disk,
    # produced by earlier per-sample processing.
    sample_paths = []
    for i in range(3):
        t = emzed.Table.create_table(
            ["mz", "rt", "area"],
            [emzed.MzType, emzed.RtType, float],
            rows=[
                [150.0 + i, 30.0 + i, 1e5 * (i + 1)],
                [151.0 + i, 31.0 + i, 0.5e5 * (i + 1)],
            ],
        )
        path = folder / f"sample_{i}.table"
        t.save(path)
        sample_paths.append(path)

    # Open each sample on disk and keep only high-quality peaks
    # (here: peaks whose area exceeds 0.8e5).
    filtered = []
    for path in sample_paths:
        t = emzed.Table.open(path)
        # keep_view=True keeps the filter as an on-disk view;
        # no rows are loaded into RAM.
        t = t.filter(t.area > 0.8e5, keep_view=True)
        filtered.append(t)

    # Combine the filtered views into one on-disk table.
    combined = emzed.Table.stack_tables(
        filtered,
        path=folder / "combined.table",
        overwrite=True,
    )

    # Report while the backing file still exists.
    print("combined is on disk:", not combined.is_in_memory())
    print(combined)
combined is on disk: True
mz rt area
MzType RtType float
----------- -------- -------------
150.000000 0.50 m 100000.000000
151.000000 0.52 m 200000.000000
152.000000 0.53 m 100000.000000
152.000000 0.53 m 300000.000000
153.000000 0.55 m 150000.000000