Skip to content

Out-of-memory Processing

emzed.Table.open() reads a table from disk on demand, so the familiar table API stays available even when the data would not fit in RAM — for example when combining peak tables from many samples in a workflow.

"""Combine filtered peak tables from many samples without loading them into RAM."""

from pathlib import Path
from tempfile import TemporaryDirectory

import emzed

with TemporaryDirectory() as tmp:
    folder = Path(tmp)

    # Stand-in for earlier per-sample processing: write three small peak
    # tables to disk so the steps below have files to open.
    sample_paths = []
    for sample_no in range(3):
        scale = sample_no + 1
        peaks = emzed.Table.create_table(
            ["mz", "rt", "area"],
            [emzed.MzType, emzed.RtType, float],
            rows=[
                [150.0 + sample_no, 30.0 + sample_no, 1e5 * scale],
                [151.0 + sample_no, 31.0 + sample_no, 0.5e5 * scale],
            ],
        )
        target = folder / f"sample_{sample_no}.table"
        peaks.save(target)
        sample_paths.append(target)

    # Re-open every sample from its file and keep only the peaks whose
    # area exceeds 0.8e5.
    filtered = []
    for table_path in sample_paths:
        on_disk = emzed.Table.open(table_path)
        large_area = on_disk.area > 0.8e5
        # keep_view=True requests the filter result as a view on the
        # on-disk table instead of a copy held in RAM.
        filtered.append(on_disk.filter(large_area, keep_view=True))

    # Stack the filtered views into a single table that is written to
    # its own file inside the temporary folder.
    combined_path = folder / "combined.table"
    combined = emzed.Table.stack_tables(
        filtered,
        path=combined_path,
        overwrite=True,
    )

    stored_on_disk = not combined.is_in_memory()
    print("combined is on disk:", stored_on_disk)
    print(combined)
combined is on disk: True
mz           rt        area
MzType       RtType    float
-----------  --------  -------------
 150.000000    0.50 m  100000.000000
 151.000000    0.52 m  200000.000000
 152.000000    0.53 m  100000.000000
 152.000000    0.53 m  300000.000000
 153.000000    0.55 m  150000.000000