Initial commit

This commit is contained in:
Dom
2026-03-05 01:20:15 +01:00
commit c0c50e56f0
364 changed files with 62207 additions and 0 deletions

1
omop/scripts/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Utility scripts for OMOP pipeline."""

View File

@@ -0,0 +1,332 @@
#!/usr/bin/env python3
"""
Generate Sample Data for OMOP Pipeline Testing
This script generates fictional healthcare data and loads it into staging tables.
It creates realistic but completely fake patient, visit, condition, and drug data.
"""
import sys
import os
from pathlib import Path
from datetime import datetime, timedelta
import random
from faker import Faker
from sqlalchemy import text
import psycopg2
# Database configuration.
# Each setting can be overridden via an environment variable so credentials
# do not have to live in source control. The literal defaults are kept only
# for backward compatibility with existing local setups.
DB_CONFIG = {
    'host': os.environ.get('DB_HOST', 'localhost'),
    'port': int(os.environ.get('DB_PORT', '5432')),
    'database': os.environ.get('DB_NAME', 'omop_cdm'),
    'user': os.environ.get('DB_USER', 'dom'),
    # TODO(review): drop this default and require DB_PASSWORD to be set;
    # a committed password is a security liability even for test data.
    'password': os.environ.get('DB_PASSWORD', 'loli'),
}
# Initialize Faker for generating fake data
fake = Faker('fr_FR')  # French locale
Faker.seed(42)  # For reproducibility
random.seed(42)

# Sample medical codes as (code, French description) pairs.
# ICD-10 diagnosis codes used by generate_conditions().
ICD10_CODES = [
    ('E11.9', 'Diabète de type 2 sans complication'),
    ('I10', 'Hypertension essentielle'),
    ('J45.9', 'Asthme non précisé'),
    ('M79.3', 'Panniculite non précisée'),
    ('K21.9', 'Reflux gastro-oesophagien sans oesophagite'),
]

# ATC drug codes used by generate_drugs().
ATC_CODES = [
    ('A10BA02', 'Metformine'),
    ('C09AA02', 'Enalapril'),
    ('R03AC02', 'Salbutamol'),
    ('A02BC01', 'Oméprazole'),
    ('N02BE01', 'Paracétamol'),
]

# Visit types as (code, French description) pairs, used by generate_visits().
VISIT_TYPES = [
    ('consultation', 'Consultation externe'),
    ('urgence', 'Urgence'),
    ('hospitalisation', 'Hospitalisation'),
]
def generate_patients(num_patients=100):
    """Build a list of fake patient records.

    Each record mirrors one row of staging.raw_patients: a zero-padded
    source id, a random adult (18-90) birth date, sex, postcode, and the
    provenance/status bookkeeping fields.
    """
    records = []
    for seq in range(1, num_patients + 1):
        records.append({
            'source_patient_id': f'PAT{seq:05d}',
            'date_naissance': fake.date_of_birth(minimum_age=18, maximum_age=90),
            'sexe': random.choice(['M', 'F']),
            'code_postal': fake.postcode(),
            'source_fichier': 'sample_data_generation',
            'statut_traitement': 'pending',
        })
    return records
def generate_visits(patients, visits_per_patient=3):
    """Build fake visit records for each patient.

    Every patient gets 1..visits_per_patient visits, each starting at a
    random point within the last two years. Hospitalisations last 1-14
    days, emergency visits 0-1 days, consultations end the same day.
    """
    all_visits = []
    next_id = 1
    for person in patients:
        for _ in range(random.randint(1, visits_per_patient)):
            kind, _label = random.choice(VISIT_TYPES)
            # Start somewhere within the last 2 years (730 days).
            start = datetime.now() - timedelta(days=random.randint(1, 730))
            # Length of stay depends on the visit type.
            if kind == 'hospitalisation':
                stay_days = random.randint(1, 14)
            elif kind == 'urgence':
                stay_days = random.randint(0, 1)
            else:
                stay_days = 0
            all_visits.append({
                'source_visit_id': f'VIS{next_id:06d}',
                'source_patient_id': person['source_patient_id'],
                'type_visite': kind,
                'date_debut': start,
                'date_fin': start + timedelta(days=stay_days),
                'source_fichier': 'sample_data_generation',
                'statut_traitement': 'pending',
            })
            next_id += 1
    return all_visits
def generate_conditions(visits):
    """Build fake diagnosis records tied to visits.

    About 70% of visits receive 1-2 ICD-10 coded diagnoses, dated on the
    day the visit started.
    """
    diagnoses = []
    seq = 1
    for v in visits:
        # ~30% of visits carry no diagnosis at all.
        if random.random() >= 0.7:
            continue
        for _ in range(random.randint(1, 2)):
            icd_code, _label = random.choice(ICD10_CODES)
            diagnoses.append({
                'source_condition_id': f'COND{seq:06d}',
                'source_patient_id': v['source_patient_id'],
                'source_visit_id': v['source_visit_id'],
                'code_diagnostic': icd_code,
                'systeme_codage': 'ICD10',
                'date_diagnostic': v['date_debut'].date(),
                'source_fichier': 'sample_data_generation',
                'statut_traitement': 'pending',
            })
            seq += 1
    return diagnoses
def generate_drugs(visits):
    """Build fake drug-prescription records tied to visits.

    Roughly 60% of visits receive 1-3 prescriptions drawn from the ATC
    code table; each prescription starts on the visit's start date and
    runs for 7-90 days.
    """
    prescriptions = []
    seq = 1
    for v in visits:
        # ~40% of visits get no prescription at all.
        if random.random() >= 0.6:
            continue
        for _ in range(random.randint(1, 3)):
            atc_code, _label = random.choice(ATC_CODES)
            started = v['date_debut']
            days = random.randint(7, 90)
            prescriptions.append({
                'source_drug_id': f'DRUG{seq:06d}',
                'source_patient_id': v['source_patient_id'],
                'source_visit_id': v['source_visit_id'],
                'code_medicament': atc_code,
                'systeme_codage': 'ATC',
                'date_debut': started.date(),
                'date_fin': (started + timedelta(days=days)).date(),
                'quantite': random.randint(1, 3),
                'duree_traitement': days,
                'source_fichier': 'sample_data_generation',
                'statut_traitement': 'pending',
            })
            seq += 1
    return prescriptions
def load_data_to_staging(patients, visits, conditions, drugs):
    """Load generated records into the staging tables.

    All four datasets are inserted in a single transaction: if any insert
    fails the transaction is rolled back and the exception re-raised, so
    the staging tables are never left partially loaded.

    Args:
        patients: dicts shaped for staging.raw_patients.
        visits: dicts shaped for staging.raw_visits.
        conditions: dicts shaped for staging.raw_conditions.
        drugs: dicts shaped for staging.raw_drugs.

    Raises:
        Exception: any database error, re-raised after rollback.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    cursor = conn.cursor()

    def _bulk_insert(label, table, columns, rows):
        # One executemany per table instead of a Python-level loop of
        # single-row INSERTs: same SQL, far fewer server round trips.
        # Table/column names are hard-coded tuples below (not user input);
        # row values remain fully parameterized.
        print(f"Loading {len(rows)} {label}...")
        placeholders = ', '.join(['%s'] * len(columns))
        sql = f"INSERT INTO {table} ({', '.join(columns)}) VALUES ({placeholders})"
        cursor.executemany(sql, [tuple(row[col] for col in columns) for row in rows])

    try:
        _bulk_insert(
            'patients', 'staging.raw_patients',
            ('source_patient_id', 'date_naissance', 'sexe', 'code_postal',
             'source_fichier', 'statut_traitement'),
            patients,
        )
        _bulk_insert(
            'visits', 'staging.raw_visits',
            ('source_visit_id', 'source_patient_id', 'type_visite',
             'date_debut', 'date_fin', 'source_fichier', 'statut_traitement'),
            visits,
        )
        _bulk_insert(
            'conditions', 'staging.raw_conditions',
            ('source_condition_id', 'source_patient_id', 'source_visit_id',
             'code_diagnostic', 'systeme_codage', 'date_diagnostic',
             'source_fichier', 'statut_traitement'),
            conditions,
        )
        # NOTE(review): generate_drugs() also produces 'duree_traitement',
        # which the original INSERT never loaded (column absent from the
        # list below) -- confirm against the staging.raw_drugs schema.
        _bulk_insert(
            'drug prescriptions', 'staging.raw_drugs',
            ('source_drug_id', 'source_patient_id', 'source_visit_id',
             'code_medicament', 'systeme_codage', 'date_debut', 'date_fin',
             'quantite', 'source_fichier', 'statut_traitement'),
            drugs,
        )

        conn.commit()
        print("✓ All sample data loaded successfully!")

        # Operator-facing summary of what was loaded.
        print("\n" + "=" * 60)
        print("SAMPLE DATA GENERATION SUMMARY")
        print("=" * 60)
        print(f"Patients: {len(patients)}")
        print(f"Visits: {len(visits)}")
        print(f"Conditions: {len(conditions)}")
        print(f"Drug prescriptions: {len(drugs)}")
        print("=" * 60)
        print("\nData loaded into staging tables with status 'pending'")
        print("Ready for ETL processing!")
        print("=" * 60)
    except Exception as e:
        conn.rollback()
        print(f"Error loading data: {str(e)}")
        raise
    finally:
        cursor.close()
        conn.close()
def main():
    """Drive the full sample-data workflow: generate, summarize, load."""
    print("Generating sample healthcare data...")
    print("=" * 60)

    # Dataset size knobs.
    num_patients = 100
    visits_per_patient = 3

    # Generate in dependency order: visits need patients; conditions and
    # drugs both hang off visits.
    print(f"Generating {num_patients} patients...")
    patients = generate_patients(num_patients)

    print(f"Generating visits (avg {visits_per_patient} per patient)...")
    visits = generate_visits(patients, visits_per_patient)

    print("Generating conditions/diagnoses...")
    conditions = generate_conditions(visits)

    print("Generating drug prescriptions...")
    drugs = generate_drugs(visits)

    print("\nData generation complete!")
    print(f" - {len(patients)} patients")
    print(f" - {len(visits)} visits")
    print(f" - {len(conditions)} conditions")
    print(f" - {len(drugs)} drug prescriptions")

    # Persist everything into the staging schema.
    print("\nConnecting to database and loading data...")
    load_data_to_staging(patients, visits, conditions, drugs)

    print("\n✓ Sample data generation complete!")
    print("\nNext steps:")
    print(" 1. Run ETL pipeline: omop-pipeline etl run --source staging.raw_patients --target person")
    print(" 2. Check results: omop-pipeline stats show")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,80 @@
#!/bin/bash
# Load Sample Data Script
# This script sets up the database and loads sample data for testing
#
# Steps performed:
#   1. Install Python dependencies (faker)
#   2. Create the database schemas via the project CLI
#   3. Generate and load sample data into staging tables
#   4. Verify row counts in the staging tables
#
# Must be executed from the omop/ project root (where setup.py lives).
set -e

echo "=========================================="
echo "OMOP Sample Data Loading Script"
echo "=========================================="
echo ""

# Colors for output
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color

# Check if we're in the right directory (setup.py marks the project root)
if [ ! -f "setup.py" ]; then
    echo -e "${RED}Error: Must be run from omop directory${NC}"
    exit 1
fi

# Step 1: Install dependencies
echo -e "${YELLOW}Step 1: Installing dependencies...${NC}"
# pip errors are suppressed: a non-zero exit is treated as "already installed".
pip install faker > /dev/null 2>&1 || echo "Faker already installed"
echo -e "${GREEN}✓ Dependencies installed${NC}"
echo ""

# Step 2: Create database schemas
echo -e "${YELLOW}Step 2: Creating database schemas...${NC}"
# Failure is tolerated so re-runs don't abort when the schemas already exist.
python -m src.cli.commands schema create --type all 2>/dev/null || echo "Schemas may already exist"
echo -e "${GREEN}✓ Schemas ready${NC}"
echo ""

# Step 3: Generate and load sample data
echo -e "${YELLOW}Step 3: Generating and loading sample data...${NC}"
python scripts/generate_sample_data.py
echo -e "${GREEN}✓ Sample data loaded${NC}"
echo ""

# Step 4: Verify data
echo -e "${YELLOW}Step 4: Verifying loaded data...${NC}"
# Inline Python: counts the rows in each staging table using the project's
# own config/connection helpers. The quoted block is passed verbatim to
# `python -c`, so its indentation is significant -- edit with care.
python -c "
from src.utils.config import Config
from src.utils.db_connection import DatabaseConnection
from sqlalchemy import text
config = Config.load('config.yaml')
db = DatabaseConnection(config)
with db.get_session() as session:
    # Count records in staging tables
    tables = ['raw_patients', 'raw_visits', 'raw_conditions', 'raw_drugs']
    print('\nStaging Table Counts:')
    print('-' * 40)
    for table in tables:
        query = text(f'SELECT COUNT(*) FROM staging.{table}')
        count = session.execute(query).fetchone()[0]
        print(f' staging.{table:20s}: {count:5d} records')
    print('-' * 40)
"
echo -e "${GREEN}✓ Data verification complete${NC}"
echo ""

echo "=========================================="
echo -e "${GREEN}Sample data loading complete!${NC}"
echo "=========================================="
echo ""
echo "Next steps:"
echo " 1. Run ETL pipeline:"
echo " omop-pipeline etl run --source staging.raw_patients --target person"
echo ""
echo " 2. View statistics:"
echo " omop-pipeline stats show"
echo ""
echo " 3. Validate data:"
echo " omop-pipeline validate"
echo ""

106
omop/scripts/load_vocabularies.sh Executable file
View File

@@ -0,0 +1,106 @@
#!/bin/bash
# Vocabulary Loading Script for OMOP Data Pipeline
# This script downloads and loads OMOP vocabularies
#
# Workflow:
#   1. Verify the vocabulary directory and required Athena CSV exports exist
#   2. Report per-file record counts
#   3. Hand off the actual database load to `omop-pipeline vocab load`
set -e  # Exit on error

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration (VOCAB_DIR can be overridden from the environment)
VOCAB_DIR="${VOCAB_DIR:-./vocabularies}"
ATHENA_URL="https://athena.ohdsi.org/"

echo -e "${GREEN}OMOP Vocabulary Loader${NC}"
echo "================================"
echo "Vocabulary directory: $VOCAB_DIR"
echo "================================"
echo ""

# Check if vocabulary directory exists; if not, print download instructions
# and bail out -- vocabularies must be fetched manually from Athena.
if [ ! -d "$VOCAB_DIR" ]; then
    echo -e "${YELLOW}Vocabulary directory not found: $VOCAB_DIR${NC}"
    echo ""
    echo "To download OMOP vocabularies:"
    echo "1. Visit $ATHENA_URL"
    echo "2. Select the vocabularies you need"
    echo "3. Download the vocabulary bundle"
    echo "4. Extract to $VOCAB_DIR"
    echo ""
    echo "Required vocabularies for basic functionality:"
    echo " - SNOMED"
    echo " - ICD10CM"
    echo " - RxNorm"
    echo " - LOINC"
    echo " - CPT4"
    echo ""
    exit 1
fi

# Check for required vocabulary files (the standard Athena export set)
echo -e "${YELLOW}Checking vocabulary files...${NC}"
REQUIRED_FILES=(
    "CONCEPT.csv"
    "VOCABULARY.csv"
    "DOMAIN.csv"
    "CONCEPT_CLASS.csv"
    "CONCEPT_RELATIONSHIP.csv"
    "RELATIONSHIP.csv"
)
MISSING_FILES=()
for file in "${REQUIRED_FILES[@]}"; do
    if [ ! -f "$VOCAB_DIR/$file" ]; then
        MISSING_FILES+=("$file")
    fi
done
if [ ${#MISSING_FILES[@]} -gt 0 ]; then
    echo -e "${RED}Error: Missing required vocabulary files:${NC}"
    for file in "${MISSING_FILES[@]}"; do
        echo " - $file"
    done
    echo ""
    echo "Please ensure all vocabulary files are extracted to $VOCAB_DIR"
    exit 1
fi
echo -e "${GREEN}✓ All required vocabulary files found${NC}"
echo ""

# Count records in vocabulary files (line count minus the header row)
echo -e "${YELLOW}Vocabulary file statistics:${NC}"
for file in "${REQUIRED_FILES[@]}"; do
    if [ -f "$VOCAB_DIR/$file" ]; then
        count=$(wc -l < "$VOCAB_DIR/$file")
        echo " $file: $((count - 1)) records"
    fi
done
echo ""

# Load vocabularies using Python CLI (requires the package to be installed)
echo -e "${YELLOW}Loading vocabularies into database...${NC}"
echo "This may take several minutes depending on vocabulary size..."
echo ""
if command -v omop-pipeline &> /dev/null; then
    omop-pipeline vocab load --path "$VOCAB_DIR"
    echo ""
    echo -e "${GREEN}✓ Vocabularies loaded successfully${NC}"
else
    echo -e "${RED}Error: omop-pipeline command not found${NC}"
    echo "Please install the package with: pip install -e ."
    exit 1
fi

echo ""
echo -e "${GREEN}================================${NC}"
echo -e "${GREEN}Vocabulary loading completed!${NC}"
echo -e "${GREEN}================================${NC}"
echo ""
echo "You can now run the ETL pipeline:"
echo " omop-pipeline etl run --source staging.raw_patients --target person"
echo ""

73
omop/scripts/run_tests.sh Executable file
View File

@@ -0,0 +1,73 @@
#!/bin/bash
# Test Execution Script for OMOP Data Pipeline
# This script runs all tests with coverage reporting.
#
# Exit status: non-zero if pytest fails. Linting (flake8) and type
# checking (mypy) run only when those tools are installed.
set -e  # Exit on error (pytest's status is captured explicitly below)

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo -e "${GREEN}OMOP Pipeline Test Suite${NC}"
echo "================================"
echo ""

# Check if pytest is installed
if ! command -v pytest &> /dev/null; then
    echo -e "${RED}Error: pytest not found${NC}"
    echo "Please install test dependencies:"
    echo " pip install -e .[test]"
    exit 1
fi

# Run tests with coverage.
# BUG FIX: with `set -e`, a failing bare `pytest` terminated the script
# immediately, so `TEST_EXIT_CODE=$?` always saw 0 and the failure branch
# below was unreachable. Appending `|| TEST_EXIT_CODE=$?` captures the
# real status without triggering errexit.
echo -e "${YELLOW}Running tests with coverage...${NC}"
echo ""
TEST_EXIT_CODE=0
pytest \
    --verbose \
    --cov=src \
    --cov-report=html \
    --cov-report=term \
    --cov-report=xml \
    tests/ || TEST_EXIT_CODE=$?

echo ""
if [ $TEST_EXIT_CODE -eq 0 ]; then
    echo -e "${GREEN}================================${NC}"
    echo -e "${GREEN}All tests passed!${NC}"
    echo -e "${GREEN}================================${NC}"
    echo ""
    echo "Coverage report generated:"
    echo " HTML: htmlcov/index.html"
    echo " XML: coverage.xml"
    echo ""
else
    echo -e "${RED}================================${NC}"
    echo -e "${RED}Some tests failed${NC}"
    echo -e "${RED}================================${NC}"
    echo ""
    exit $TEST_EXIT_CODE
fi

# Optional: Run linting (only when flake8 is on PATH)
if command -v flake8 &> /dev/null; then
    echo -e "${YELLOW}Running code quality checks...${NC}"
    flake8 src/ --max-line-length=100 --exclude=__pycache__,*.pyc
    echo -e "${GREEN}✓ Code quality checks passed${NC}"
    echo ""
fi

# Optional: Run type checking (only when mypy is on PATH)
if command -v mypy &> /dev/null; then
    echo -e "${YELLOW}Running type checks...${NC}"
    mypy src/ --ignore-missing-imports
    echo -e "${GREEN}✓ Type checks passed${NC}"
    echo ""
fi

echo -e "${GREEN}Test suite completed successfully!${NC}"

91
omop/scripts/setup_database.sh Executable file
View File

@@ -0,0 +1,91 @@
#!/bin/bash
# Database Setup Script for OMOP Data Pipeline
# This script creates the database and schemas for the OMOP pipeline
#
# Workflow:
#   1. Check PostgreSQL is reachable
#   2. Create the database and application user if missing (idempotent)
#   3. Grant privileges
#   4. Create the OMOP schemas via the project CLI, if installed
set -e  # Exit on error

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration (can be overridden by environment variables)
DB_HOST="${DB_HOST:-localhost}"
DB_PORT="${DB_PORT:-5432}"
DB_NAME="${DB_NAME:-omop_db}"
DB_USER="${DB_USER:-postgres}"
DB_PASSWORD="${DB_PASSWORD:-}"
ADMIN_USER="${ADMIN_USER:-postgres}"

echo -e "${GREEN}OMOP Database Setup${NC}"
echo "================================"
echo "Host: $DB_HOST"
echo "Port: $DB_PORT"
echo "Database: $DB_NAME"
echo "User: $DB_USER"
echo "================================"
echo ""

# Check if PostgreSQL is running
echo -e "${YELLOW}Checking PostgreSQL connection...${NC}"
if ! pg_isready -h "$DB_HOST" -p "$DB_PORT" > /dev/null 2>&1; then
    echo -e "${RED}Error: Cannot connect to PostgreSQL at $DB_HOST:$DB_PORT${NC}"
    echo "Please ensure PostgreSQL is running and accessible."
    exit 1
fi
echo -e "${GREEN}✓ PostgreSQL is running${NC}"
echo ""

# Create database if it doesn't exist.
# NOTE(review): DB_NAME/DB_USER are interpolated unquoted into SQL below;
# this assumes trusted, shell-safe values from a local operator -- confirm
# before exposing these variables to untrusted input.
echo -e "${YELLOW}Creating database...${NC}"
if PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -lqt | cut -d \| -f 1 | grep -qw "$DB_NAME"; then
    echo -e "${YELLOW}Database $DB_NAME already exists${NC}"
else
    PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -c "CREATE DATABASE $DB_NAME;"
    echo -e "${GREEN}✓ Database $DB_NAME created${NC}"
fi
echo ""

# Create user if it doesn't exist (checked via pg_roles)
echo -e "${YELLOW}Creating database user...${NC}"
if PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -d "$DB_NAME" -tAc "SELECT 1 FROM pg_roles WHERE rolname='$DB_USER'" | grep -q 1; then
    echo -e "${YELLOW}User $DB_USER already exists${NC}"
else
    PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -d "$DB_NAME" -c "CREATE USER $DB_USER WITH PASSWORD '$DB_PASSWORD';"
    echo -e "${GREEN}✓ User $DB_USER created${NC}"
fi
echo ""

# Grant privileges on the database, the public schema, and future objects
echo -e "${YELLOW}Granting privileges...${NC}"
PGPASSWORD="$DB_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -d "$DB_NAME" <<EOF
GRANT ALL PRIVILEGES ON DATABASE $DB_NAME TO $DB_USER;
GRANT ALL ON SCHEMA public TO $DB_USER;
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO $DB_USER;
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO $DB_USER;
EOF
echo -e "${GREEN}✓ Privileges granted${NC}"
echo ""

# Create schemas using the Python CLI (best effort if the CLI is missing)
echo -e "${YELLOW}Creating OMOP schemas...${NC}"
if command -v omop-pipeline &> /dev/null; then
    omop-pipeline schema create --type all
    echo -e "${GREEN}✓ OMOP schemas created${NC}"
else
    echo -e "${YELLOW}Warning: omop-pipeline command not found${NC}"
    echo "Please install the package with: pip install -e ."
    echo "Then run: omop-pipeline schema create --type all"
fi
echo ""

echo -e "${GREEN}================================${NC}"
echo -e "${GREEN}Database setup completed!${NC}"
echo -e "${GREEN}================================${NC}"
echo ""
echo "Next steps:"
echo "1. Load vocabularies: omop-pipeline vocab load --path /path/to/vocabularies"
echo "2. Load staging data into staging tables"
echo "3. Run ETL: omop-pipeline etl run --source staging.raw_patients --target person"
echo ""