Quickstart¶
This guide walks through the core OMOPy workflow: connecting to a database, inspecting CDM tables, generating cohorts, and enriching with patient data.
Connect to a Database¶
from omopy.connector import cdm_from_con
# DuckDB file — the simplest case
cdm = cdm_from_con("data/synthea.duckdb", cdm_schema="base")
# Or from an existing Ibis connection
import ibis
con = ibis.duckdb.connect("data/synthea.duckdb", read_only=True)
cdm = cdm_from_con(con, cdm_schema="base")
cdm_from_con auto-detects the CDM version, discovers available tables,
and returns a CdmReference — a dict-like container of lazy CdmTable objects.
Explore the CDM¶
# List available tables
print(cdm.table_names)
# ['person', 'observation_period', 'condition_occurrence', ...]
# Access a table (returns a CdmTable wrapping an Ibis expression)
person = cdm["person"]
print(person.count()) # number of rows
# Collect to a Polars DataFrame
df = person.head(5).collect()
print(df)
Take a Snapshot¶
from omopy.connector import snapshot
snap = snapshot(cdm)
print(snap.person_count)
print(snap.vocabulary_version)
print(snap.earliest_observation_period_start_date)
print(snap.latest_observation_period_end_date)
Generate a Cohort¶
From Concept Sets¶
from omopy.generics import Codelist
from omopy.connector import generate_concept_cohort_set
# Define conditions by OMOP concept IDs
codelist = Codelist({
"hypertension": [320128],
"diabetes": [201826],
})
# Generate cohorts — returns an updated CDM with a new CohortTable
cdm = generate_concept_cohort_set(
cdm,
codelist,
name="conditions",
)
# Access the cohort
cohort = cdm["conditions"]
print(cohort.count())
print(cohort.collect())
From CIRCE JSON (ATLAS Definitions)¶
from omopy.connector import generate_cohort_set
cdm = generate_cohort_set(
cdm,
"path/to/cohort_definitions/",
name="atlas_cohorts",
)
Enrich with Patient Profiles¶
from omopy.profiles import add_demographics, add_age
# Add age, sex, and observation period info
enriched = add_demographics(
cdm["conditions"],
cdm,
age=True,
sex=True,
prior_observation=True,
)
df = enriched.collect()
print(df.columns)
# [..., 'age', 'sex', 'prior_observation']
Build a Codelist from Vocabulary¶
from omopy.codelist import get_candidate_codes, get_descendants
# Search for concepts by keyword
candidates = get_candidate_codes(
cdm,
keywords=["sinusitis"],
domains=["Condition"],
standard_concept="S",
)
print(candidates)
# Expand to include descendant concepts
expanded = get_descendants(cdm, candidates)
print(expanded)
Subset the CDM¶
from omopy.connector import cdm_subset, cdm_sample
# Subset to specific persons
small_cdm = cdm_subset(cdm, person_ids=[1, 2, 3])
# Or take a random sample
sample_cdm = cdm_sample(cdm, n=10)