Initial commit

This commit is contained in:
Dom
2026-03-05 01:20:15 +01:00
commit c0c50e56f0
364 changed files with 62207 additions and 0 deletions

1
omop/scripts/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Utility scripts for OMOP pipeline."""

View File

@@ -0,0 +1,332 @@
#!/usr/bin/env python3
"""
Generate Sample Data for OMOP Pipeline Testing
This script generates fictional healthcare data and loads it into staging tables.
It creates realistic but completely fake patient, visit, condition, and drug data.
"""
import sys
import os
from pathlib import Path
from datetime import datetime, timedelta
import random
from faker import Faker
from sqlalchemy import text
import psycopg2
# Database configuration.
# Each setting can be overridden via an environment variable so credentials
# do not have to live in source control. The literal defaults are kept only
# for backward compatibility with existing local setups.
DB_CONFIG = {
    'host': os.environ.get('DB_HOST', 'localhost'),
    'port': int(os.environ.get('DB_PORT', '5432')),
    'database': os.environ.get('DB_NAME', 'omop_cdm'),
    'user': os.environ.get('DB_USER', 'dom'),
    # TODO(review): drop this default and require DB_PASSWORD to be set;
    # a committed password is a security liability even for test data.
    'password': os.environ.get('DB_PASSWORD', 'loli'),
}
# Initialize Faker for generating fake data
fake = Faker('fr_FR')  # French locale
Faker.seed(42)  # For reproducibility
random.seed(42)

# Sample medical codes as (code, French description) pairs.
# ICD-10 diagnosis codes used by generate_conditions().
ICD10_CODES = [
    ('E11.9', 'Diabète de type 2 sans complication'),
    ('I10', 'Hypertension essentielle'),
    ('J45.9', 'Asthme non précisé'),
    ('M79.3', 'Panniculite non précisée'),
    ('K21.9', 'Reflux gastro-oesophagien sans oesophagite'),
]

# ATC drug codes used by generate_drugs().
ATC_CODES = [
    ('A10BA02', 'Metformine'),
    ('C09AA02', 'Enalapril'),
    ('R03AC02', 'Salbutamol'),
    ('A02BC01', 'Oméprazole'),
    ('N02BE01', 'Paracétamol'),
]

# Visit types as (code, French description) pairs, used by generate_visits().
VISIT_TYPES = [
    ('consultation', 'Consultation externe'),
    ('urgence', 'Urgence'),
    ('hospitalisation', 'Hospitalisation'),
]
def generate_patients(num_patients=100):
    """Build a list of fake patient records.

    Each record mirrors one row of staging.raw_patients: a zero-padded
    source id, a random adult (18-90) birth date, sex, postcode, and the
    provenance/status bookkeeping fields.
    """
    records = []
    for seq in range(1, num_patients + 1):
        records.append({
            'source_patient_id': f'PAT{seq:05d}',
            'date_naissance': fake.date_of_birth(minimum_age=18, maximum_age=90),
            'sexe': random.choice(['M', 'F']),
            'code_postal': fake.postcode(),
            'source_fichier': 'sample_data_generation',
            'statut_traitement': 'pending',
        })
    return records
def generate_visits(patients, visits_per_patient=3):
    """Build fake visit records for each patient.

    Every patient gets 1..visits_per_patient visits, each starting at a
    random point within the last two years. Hospitalisations last 1-14
    days, emergency visits 0-1 days, consultations end the same day.
    """
    all_visits = []
    next_id = 1
    for person in patients:
        for _ in range(random.randint(1, visits_per_patient)):
            kind, _label = random.choice(VISIT_TYPES)
            # Start somewhere within the last 2 years (730 days).
            start = datetime.now() - timedelta(days=random.randint(1, 730))
            # Length of stay depends on the visit type.
            if kind == 'hospitalisation':
                stay_days = random.randint(1, 14)
            elif kind == 'urgence':
                stay_days = random.randint(0, 1)
            else:
                stay_days = 0
            all_visits.append({
                'source_visit_id': f'VIS{next_id:06d}',
                'source_patient_id': person['source_patient_id'],
                'type_visite': kind,
                'date_debut': start,
                'date_fin': start + timedelta(days=stay_days),
                'source_fichier': 'sample_data_generation',
                'statut_traitement': 'pending',
            })
            next_id += 1
    return all_visits
def generate_conditions(visits):
    """Build fake diagnosis records tied to visits.

    About 70% of visits receive 1-2 ICD-10 coded diagnoses, dated on the
    day the visit started.
    """
    diagnoses = []
    seq = 1
    for v in visits:
        # ~30% of visits carry no diagnosis at all.
        if random.random() >= 0.7:
            continue
        for _ in range(random.randint(1, 2)):
            icd_code, _label = random.choice(ICD10_CODES)
            diagnoses.append({
                'source_condition_id': f'COND{seq:06d}',
                'source_patient_id': v['source_patient_id'],
                'source_visit_id': v['source_visit_id'],
                'code_diagnostic': icd_code,
                'systeme_codage': 'ICD10',
                'date_diagnostic': v['date_debut'].date(),
                'source_fichier': 'sample_data_generation',
                'statut_traitement': 'pending',
            })
            seq += 1
    return diagnoses
def generate_drugs(visits):
    """Build fake drug-prescription records tied to visits.

    Roughly 60% of visits receive 1-3 prescriptions drawn from the ATC
    code table; each prescription starts on the visit's start date and
    runs for 7-90 days.
    """
    prescriptions = []
    seq = 1
    for v in visits:
        # ~40% of visits get no prescription at all.
        if random.random() >= 0.6:
            continue
        for _ in range(random.randint(1, 3)):
            atc_code, _label = random.choice(ATC_CODES)
            started = v['date_debut']
            days = random.randint(7, 90)
            prescriptions.append({
                'source_drug_id': f'DRUG{seq:06d}',
                'source_patient_id': v['source_patient_id'],
                'source_visit_id': v['source_visit_id'],
                'code_medicament': atc_code,
                'systeme_codage': 'ATC',
                'date_debut': started.date(),
                'date_fin': (started + timedelta(days=days)).date(),
                'quantite': random.randint(1, 3),
                'duree_traitement': days,
                'source_fichier': 'sample_data_generation',
                'statut_traitement': 'pending',
            })
            seq += 1
    return prescriptions
def load_data_to_staging(patients, visits, conditions, drugs):
    """Load generated records into the staging tables.

    All four datasets are inserted in a single transaction: if any insert
    fails the transaction is rolled back and the exception re-raised, so
    the staging tables are never left partially loaded.

    Args:
        patients: dicts shaped for staging.raw_patients.
        visits: dicts shaped for staging.raw_visits.
        conditions: dicts shaped for staging.raw_conditions.
        drugs: dicts shaped for staging.raw_drugs.

    Raises:
        Exception: any database error, re-raised after rollback.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    cursor = conn.cursor()

    def _bulk_insert(label, table, columns, rows):
        # One executemany per table instead of a Python-level loop of
        # single-row INSERTs: same SQL, far fewer server round trips.
        # Table/column names are hard-coded tuples below (not user input);
        # row values remain fully parameterized.
        print(f"Loading {len(rows)} {label}...")
        placeholders = ', '.join(['%s'] * len(columns))
        sql = f"INSERT INTO {table} ({', '.join(columns)}) VALUES ({placeholders})"
        cursor.executemany(sql, [tuple(row[col] for col in columns) for row in rows])

    try:
        _bulk_insert(
            'patients', 'staging.raw_patients',
            ('source_patient_id', 'date_naissance', 'sexe', 'code_postal',
             'source_fichier', 'statut_traitement'),
            patients,
        )
        _bulk_insert(
            'visits', 'staging.raw_visits',
            ('source_visit_id', 'source_patient_id', 'type_visite',
             'date_debut', 'date_fin', 'source_fichier', 'statut_traitement'),
            visits,
        )
        _bulk_insert(
            'conditions', 'staging.raw_conditions',
            ('source_condition_id', 'source_patient_id', 'source_visit_id',
             'code_diagnostic', 'systeme_codage', 'date_diagnostic',
             'source_fichier', 'statut_traitement'),
            conditions,
        )
        # NOTE(review): generate_drugs() also produces 'duree_traitement',
        # which the original INSERT never loaded (column absent from the
        # list below) -- confirm against the staging.raw_drugs schema.
        _bulk_insert(
            'drug prescriptions', 'staging.raw_drugs',
            ('source_drug_id', 'source_patient_id', 'source_visit_id',
             'code_medicament', 'systeme_codage', 'date_debut', 'date_fin',
             'quantite', 'source_fichier', 'statut_traitement'),
            drugs,
        )

        conn.commit()
        print("✓ All sample data loaded successfully!")

        # Operator-facing summary of what was loaded.
        print("\n" + "=" * 60)
        print("SAMPLE DATA GENERATION SUMMARY")
        print("=" * 60)
        print(f"Patients: {len(patients)}")
        print(f"Visits: {len(visits)}")
        print(f"Conditions: {len(conditions)}")
        print(f"Drug prescriptions: {len(drugs)}")
        print("=" * 60)
        print("\nData loaded into staging tables with status 'pending'")
        print("Ready for ETL processing!")
        print("=" * 60)
    except Exception as e:
        conn.rollback()
        print(f"Error loading data: {str(e)}")
        raise
    finally:
        cursor.close()
        conn.close()
def main():
    """Drive the full sample-data workflow: generate, summarize, load."""
    print("Generating sample healthcare data...")
    print("=" * 60)

    # Dataset size knobs.
    num_patients = 100
    visits_per_patient = 3

    # Generate in dependency order: visits need patients; conditions and
    # drugs both hang off visits.
    print(f"Generating {num_patients} patients...")
    patients = generate_patients(num_patients)

    print(f"Generating visits (avg {visits_per_patient} per patient)...")
    visits = generate_visits(patients, visits_per_patient)

    print("Generating conditions/diagnoses...")
    conditions = generate_conditions(visits)

    print("Generating drug prescriptions...")
    drugs = generate_drugs(visits)

    print("\nData generation complete!")
    print(f" - {len(patients)} patients")
    print(f" - {len(visits)} visits")
    print(f" - {len(conditions)} conditions")
    print(f" - {len(drugs)} drug prescriptions")

    # Persist everything into the staging schema.
    print("\nConnecting to database and loading data...")
    load_data_to_staging(patients, visits, conditions, drugs)

    print("\n✓ Sample data generation complete!")
    print("\nNext steps:")
    print(" 1. Run ETL pipeline: omop-pipeline etl run --source staging.raw_patients --target person")
    print(" 2. Check results: omop-pipeline stats show")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,80 @@
#!/bin/bash
# Load Sample Data Script
# This script sets up the database and loads sample data for testing
#
# Steps performed:
#   1. Install Python dependencies (faker)
#   2. Create the database schemas via the project CLI
#   3. Generate and load sample data into staging tables
#   4. Verify row counts in the staging tables
#
# Must be executed from the omop/ project root (where setup.py lives).
set -e

echo "=========================================="
echo "OMOP Sample Data Loading Script"
echo "=========================================="
echo ""

# Colors for output
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color

# Check if we're in the right directory (setup.py marks the project root)
if [ ! -f "setup.py" ]; then
    echo -e "${RED}Error: Must be run from omop directory${NC}"
    exit 1
fi

# Step 1: Install dependencies
echo -e "${YELLOW}Step 1: Installing dependencies...${NC}"
# pip errors are suppressed: a non-zero exit is treated as "already installed".
pip install faker > /dev/null 2>&1 || echo "Faker already installed"
echo -e "${GREEN}✓ Dependencies installed${NC}"
echo ""

# Step 2: Create database schemas
echo -e "${YELLOW}Step 2: Creating database schemas...${NC}"
# Failure is tolerated so re-runs don't abort when the schemas already exist.
python -m src.cli.commands schema create --type all 2>/dev/null || echo "Schemas may already exist"
echo -e "${GREEN}✓ Schemas ready${NC}"
echo ""

# Step 3: Generate and load sample data
echo -e "${YELLOW}Step 3: Generating and loading sample data...${NC}"
python scripts/generate_sample_data.py
echo -e "${GREEN}✓ Sample data loaded${NC}"
echo ""

# Step 4: Verify data
echo -e "${YELLOW}Step 4: Verifying loaded data...${NC}"
# Inline Python: counts the rows in each staging table using the project's
# own config/connection helpers. The quoted block is passed verbatim to
# `python -c`, so its indentation is significant -- edit with care.
python -c "
from src.utils.config import Config
from src.utils.db_connection import DatabaseConnection
from sqlalchemy import text
config = Config.load('config.yaml')
db = DatabaseConnection(config)
with db.get_session() as session:
    # Count records in staging tables
    tables = ['raw_patients', 'raw_visits', 'raw_conditions', 'raw_drugs']
    print('\nStaging Table Counts:')
    print('-' * 40)
    for table in tables:
        query = text(f'SELECT COUNT(*) FROM staging.{table}')
        count = session.execute(query).fetchone()[0]
        print(f' staging.{table:20s}: {count:5d} records')
    print('-' * 40)
"
echo -e "${GREEN}✓ Data verification complete${NC}"
echo ""

echo "=========================================="
echo -e "${GREEN}Sample data loading complete!${NC}"
echo "=========================================="
echo ""
echo "Next steps:"
echo " 1. Run ETL pipeline:"
echo " omop-pipeline etl run --source staging.raw_patients --target person"
echo ""
echo " 2. View statistics:"
echo " omop-pipeline stats show"
echo ""
echo " 3. Validate data:"
echo " omop-pipeline validate"
echo ""

106
omop/scripts/load_vocabularies.sh Executable file
View File

@@ -0,0 +1,106 @@
#!/bin/bash
# Vocabulary Loading Script for OMOP Data Pipeline
# This script downloads and loads OMOP vocabularies
#
# Workflow:
#   1. Verify the vocabulary directory and required Athena CSV exports exist
#   2. Report per-file record counts
#   3. Hand off the actual database load to `omop-pipeline vocab load`
set -e  # Exit on error

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration (VOCAB_DIR can be overridden from the environment)
VOCAB_DIR="${VOCAB_DIR:-./vocabularies}"
ATHENA_URL="https://athena.ohdsi.org/"

echo -e "${GREEN}OMOP Vocabulary Loader${NC}"
echo "================================"
echo "Vocabulary directory: $VOCAB_DIR"
echo "================================"
echo ""

# Check if vocabulary directory exists; if not, print download instructions
# and bail out -- vocabularies must be fetched manually from Athena.
if [ ! -d "$VOCAB_DIR" ]; then
    echo -e "${YELLOW}Vocabulary directory not found: $VOCAB_DIR${NC}"
    echo ""
    echo "To download OMOP vocabularies:"
    echo "1. Visit $ATHENA_URL"
    echo "2. Select the vocabularies you need"
    echo "3. Download the vocabulary bundle"
    echo "4. Extract to $VOCAB_DIR"
    echo ""
    echo "Required vocabularies for basic functionality:"
    echo " - SNOMED"
    echo " - ICD10CM"
    echo " - RxNorm"
    echo " - LOINC"
    echo " - CPT4"
    echo ""
    exit 1
fi

# Check for required vocabulary files (the standard Athena export set)
echo -e "${YELLOW}Checking vocabulary files...${NC}"
REQUIRED_FILES=(
    "CONCEPT.csv"
    "VOCABULARY.csv"
    "DOMAIN.csv"
    "CONCEPT_CLASS.csv"
    "CONCEPT_RELATIONSHIP.csv"
    "RELATIONSHIP.csv"
)
MISSING_FILES=()
for file in "${REQUIRED_FILES[@]}"; do
    if [ ! -f "$VOCAB_DIR/$file" ]; then
        MISSING_FILES+=("$file")
    fi
done
if [ ${#MISSING_FILES[@]} -gt 0 ]; then
    echo -e "${RED}Error: Missing required vocabulary files:${NC}"
    for file in "${MISSING_FILES[@]}"; do
        echo " - $file"
    done
    echo ""
    echo "Please ensure all vocabulary files are extracted to $VOCAB_DIR"
    exit 1
fi
echo -e "${GREEN}✓ All required vocabulary files found${NC}"
echo ""

# Count records in vocabulary files (line count minus the header row)
echo -e "${YELLOW}Vocabulary file statistics:${NC}"
for file in "${REQUIRED_FILES[@]}"; do
    if [ -f "$VOCAB_DIR/$file" ]; then
        count=$(wc -l < "$VOCAB_DIR/$file")
        echo " $file: $((count - 1)) records"
    fi
done
echo ""

# Load vocabularies using Python CLI (requires the package to be installed)
echo -e "${YELLOW}Loading vocabularies into database...${NC}"
echo "This may take several minutes depending on vocabulary size..."
echo ""
if command -v omop-pipeline &> /dev/null; then
    omop-pipeline vocab load --path "$VOCAB_DIR"
    echo ""
    echo -e "${GREEN}✓ Vocabularies loaded successfully${NC}"
else
    echo -e "${RED}Error: omop-pipeline command not found${NC}"
    echo "Please install the package with: pip install -e ."
    exit 1
fi

echo ""
echo -e "${GREEN}================================${NC}"
echo -e "${GREEN}Vocabulary loading completed!${NC}"
echo -e "${GREEN}================================${NC}"
echo ""
echo "You can now run the ETL pipeline:"
echo " omop-pipeline etl run --source staging.raw_patients --target person"
echo ""

73
omop/scripts/run_tests.sh Executable file
View File

@@ -0,0 +1,73 @@
#!/bin/bash
# Test Execution Script for OMOP Data Pipeline
# This script runs all tests with coverage reporting.
#
# Exit status: non-zero if pytest fails. Linting (flake8) and type
# checking (mypy) run only when those tools are installed.
set -e  # Exit on error (pytest's status is captured explicitly below)

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo -e "${GREEN}OMOP Pipeline Test Suite${NC}"
echo "================================"
echo ""

# Check if pytest is installed
if ! command -v pytest &> /dev/null; then
    echo -e "${RED}Error: pytest not found${NC}"
    echo "Please install test dependencies:"
    echo " pip install -e .[test]"
    exit 1
fi

# Run tests with coverage.
# BUG FIX: with `set -e`, a failing bare `pytest` terminated the script
# immediately, so `TEST_EXIT_CODE=$?` always saw 0 and the failure branch
# below was unreachable. Appending `|| TEST_EXIT_CODE=$?` captures the
# real status without triggering errexit.
echo -e "${YELLOW}Running tests with coverage...${NC}"
echo ""
TEST_EXIT_CODE=0
pytest \
    --verbose \
    --cov=src \
    --cov-report=html \
    --cov-report=term \
    --cov-report=xml \
    tests/ || TEST_EXIT_CODE=$?

echo ""
if [ $TEST_EXIT_CODE -eq 0 ]; then
    echo -e "${GREEN}================================${NC}"
    echo -e "${GREEN}All tests passed!${NC}"
    echo -e "${GREEN}================================${NC}"
    echo ""
    echo "Coverage report generated:"
    echo " HTML: htmlcov/index.html"
    echo " XML: coverage.xml"
    echo ""
else
    echo -e "${RED}================================${NC}"
    echo -e "${RED}Some tests failed${NC}"
    echo -e "${RED}================================${NC}"
    echo ""
    exit $TEST_EXIT_CODE
fi

# Optional: Run linting (only when flake8 is on PATH)
if command -v flake8 &> /dev/null; then
    echo -e "${YELLOW}Running code quality checks...${NC}"
    flake8 src/ --max-line-length=100 --exclude=__pycache__,*.pyc
    echo -e "${GREEN}✓ Code quality checks passed${NC}"
    echo ""
fi

# Optional: Run type checking (only when mypy is on PATH)
if command -v mypy &> /dev/null; then
    echo -e "${YELLOW}Running type checks...${NC}"
    mypy src/ --ignore-missing-imports
    echo -e "${GREEN}✓ Type checks passed${NC}"
    echo ""
fi

echo -e "${GREEN}Test suite completed successfully!${NC}"

91
omop/scripts/setup_database.sh Executable file
View File

@@ -0,0 +1,91 @@
#!/bin/bash
# Database Setup Script for OMOP Data Pipeline
# This script creates the database and schemas for the OMOP pipeline
#
# Workflow:
#   1. Check PostgreSQL is reachable
#   2. Create the database and application user if missing (idempotent)
#   3. Grant privileges
#   4. Create the OMOP schemas via the project CLI, if installed
set -e  # Exit on error

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration (can be overridden by environment variables)
DB_HOST="${DB_HOST:-localhost}"
DB_PORT="${DB_PORT:-5432}"
DB_NAME="${DB_NAME:-omop_db}"
DB_USER="${DB_USER:-postgres}"
DB_PASSWORD="${DB_PASSWORD:-}"
ADMIN_USER="${ADMIN_USER:-postgres}"

echo -e "${GREEN}OMOP Database Setup${NC}"
echo "================================"
echo "Host: $DB_HOST"
echo "Port: $DB_PORT"
echo "Database: $DB_NAME"
echo "User: $DB_USER"
echo "================================"
echo ""

# Check if PostgreSQL is running
echo -e "${YELLOW}Checking PostgreSQL connection...${NC}"
if ! pg_isready -h "$DB_HOST" -p "$DB_PORT" > /dev/null 2>&1; then
    echo -e "${RED}Error: Cannot connect to PostgreSQL at $DB_HOST:$DB_PORT${NC}"
    echo "Please ensure PostgreSQL is running and accessible."
    exit 1
fi
echo -e "${GREEN}✓ PostgreSQL is running${NC}"
echo ""

# Create database if it doesn't exist.
# NOTE(review): DB_NAME/DB_USER are interpolated unquoted into SQL below;
# this assumes trusted, shell-safe values from a local operator -- confirm
# before exposing these variables to untrusted input.
echo -e "${YELLOW}Creating database...${NC}"
if PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -lqt | cut -d \| -f 1 | grep -qw "$DB_NAME"; then
    echo -e "${YELLOW}Database $DB_NAME already exists${NC}"
else
    PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -c "CREATE DATABASE $DB_NAME;"
    echo -e "${GREEN}✓ Database $DB_NAME created${NC}"
fi
echo ""

# Create user if it doesn't exist (checked via pg_roles)
echo -e "${YELLOW}Creating database user...${NC}"
if PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -d "$DB_NAME" -tAc "SELECT 1 FROM pg_roles WHERE rolname='$DB_USER'" | grep -q 1; then
    echo -e "${YELLOW}User $DB_USER already exists${NC}"
else
    PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -d "$DB_NAME" -c "CREATE USER $DB_USER WITH PASSWORD '$DB_PASSWORD';"
    echo -e "${GREEN}✓ User $DB_USER created${NC}"
fi
echo ""

# Grant privileges on the database, the public schema, and future objects
echo -e "${YELLOW}Granting privileges...${NC}"
PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -d "$DB_NAME" <<EOF
GRANT ALL PRIVILEGES ON DATABASE $DB_NAME TO $DB_USER;
GRANT ALL ON SCHEMA public TO $DB_USER;
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO $DB_USER;
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO $DB_USER;
EOF
echo -e "${GREEN}✓ Privileges granted${NC}"
echo ""

# Create schemas using the Python CLI (best effort if the CLI is missing)
echo -e "${YELLOW}Creating OMOP schemas...${NC}"
if command -v omop-pipeline &> /dev/null; then
    omop-pipeline schema create --type all
    echo -e "${GREEN}✓ OMOP schemas created${NC}"
else
    echo -e "${YELLOW}Warning: omop-pipeline command not found${NC}"
    echo "Please install the package with: pip install -e ."
    echo "Then run: omop-pipeline schema create --type all"
fi
echo ""

echo -e "${GREEN}================================${NC}"
echo -e "${GREEN}Database setup completed!${NC}"
echo -e "${GREEN}================================${NC}"
echo ""
echo "Next steps:"
echo "1. Load vocabularies: omop-pipeline vocab load --path /path/to/vocabularies"
echo "2. Load staging data into staging tables"
echo "3. Run ETL: omop-pipeline etl run --source staging.raw_patients --target person"
echo ""