Initial commit
This commit is contained in:
1
omop/scripts/__init__.py
Normal file
1
omop/scripts/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Utility scripts for OMOP pipeline."""
|
||||
332
omop/scripts/generate_sample_data.py
Executable file
332
omop/scripts/generate_sample_data.py
Executable file
@@ -0,0 +1,332 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate Sample Data for OMOP Pipeline Testing
|
||||
|
||||
This script generates fictional healthcare data and loads it into staging tables.
|
||||
It creates realistic but completely fake patient, visit, condition, and drug data.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
import random
|
||||
from faker import Faker
|
||||
from sqlalchemy import text
|
||||
import psycopg2
|
||||
|
||||
# Database configuration
|
||||
# Database connection settings. Each value can be overridden via the
# corresponding OMOP_DB_* environment variable so that credentials do
# not have to live in source control; the fallbacks preserve the
# previous hard-coded development defaults, so behavior is unchanged
# when no variables are set.
DB_CONFIG = {
    'host': os.environ.get('OMOP_DB_HOST', 'localhost'),
    'port': int(os.environ.get('OMOP_DB_PORT', '5432')),
    'database': os.environ.get('OMOP_DB_NAME', 'omop_cdm'),
    'user': os.environ.get('OMOP_DB_USER', 'dom'),
    # NOTE(review): the hard-coded password is kept only as a local
    # development fallback; set OMOP_DB_PASSWORD in any real environment.
    'password': os.environ.get('OMOP_DB_PASSWORD', 'loli'),
}
|
||||
|
||||
# Deterministic fake-data setup: a French-locale Faker instance plus
# fixed seeds for both generators, so repeated runs of this script
# produce byte-identical datasets.
fake = Faker('fr_FR')  # French locale
random.seed(42)
Faker.seed(42)  # For reproducibility
|
||||
|
||||
# Sample medical code lists used by the generators below.
# Each entry is a (code, human-readable French label) pair.

# ICD-10 diagnosis codes drawn for generate_conditions().
ICD10_CODES = [
    ('E11.9', 'Diabète de type 2 sans complication'),
    ('I10', 'Hypertension essentielle'),
    ('J45.9', 'Asthme non précisé'),
    ('M79.3', 'Panniculite non précisée'),
    ('K21.9', 'Reflux gastro-oesophagien sans oesophagite'),
]

# ATC drug codes drawn for generate_drugs().
ATC_CODES = [
    ('A10BA02', 'Metformine'),
    ('C09AA02', 'Enalapril'),
    ('R03AC02', 'Salbutamol'),
    ('A02BC01', 'Oméprazole'),
    ('N02BE01', 'Paracétamol'),
]

# Visit categories drawn for generate_visits(); the first element of
# each pair is the value stored in staging, the second a description.
VISIT_TYPES = [
    ('consultation', 'Consultation externe'),
    ('urgence', 'Urgence'),
    ('hospitalisation', 'Hospitalisation'),
]
|
||||
|
||||
|
||||
def generate_patients(num_patients=100):
    """Generate fake patient records for the staging layer.

    Args:
        num_patients: Number of patient rows to produce.

    Returns:
        List of dicts whose keys match the staging.raw_patients columns;
        every row is marked 'pending' for the downstream ETL.
    """
    def _one_patient(idx):
        # Draw the birth date first so the Faker/random call order matches
        # the previous implementation (keeps seeded runs reproducible).
        dob = fake.date_of_birth(minimum_age=18, maximum_age=90)
        return {
            'source_patient_id': f'PAT{idx+1:05d}',
            'date_naissance': dob,
            'sexe': random.choice(['M', 'F']),
            'code_postal': fake.postcode(),
            'source_fichier': 'sample_data_generation',
            'statut_traitement': 'pending',
        }

    return [_one_patient(idx) for idx in range(num_patients)]
|
||||
|
||||
|
||||
def generate_visits(patients, visits_per_patient=3):
    """Generate fake visit rows linked to the given patients.

    Args:
        patients: Patient dicts as produced by generate_patients().
        visits_per_patient: Upper bound on visits per patient (the
            actual count is uniform between 1 and this value).

    Returns:
        List of dicts whose keys match the staging.raw_visits columns.
    """
    visits = []
    next_id = 1

    for patient in patients:
        for _ in range(random.randint(1, visits_per_patient)):
            visit_type, _desc = random.choice(VISIT_TYPES)

            # Start somewhere within the last two years.
            start = datetime.now() - timedelta(days=random.randint(1, 730))

            # Inpatient stays last days, emergencies at most overnight,
            # consultations are same-day (zero-length).
            if visit_type == 'hospitalisation':
                stay_days = random.randint(1, 14)
            elif visit_type == 'urgence':
                stay_days = random.randint(0, 1)
            else:
                stay_days = 0

            visits.append({
                'source_visit_id': f'VIS{next_id:06d}',
                'source_patient_id': patient['source_patient_id'],
                'type_visite': visit_type,
                'date_debut': start,
                'date_fin': start + timedelta(days=stay_days),
                'source_fichier': 'sample_data_generation',
                'statut_traitement': 'pending',
            })
            next_id += 1

    return visits
|
||||
|
||||
|
||||
def generate_conditions(visits):
    """Generate fake ICD-10 diagnosis rows for a subset of visits.

    About 70% of visits receive one or two diagnoses drawn from
    ICD10_CODES; the remainder get none.

    Args:
        visits: Visit dicts as produced by generate_visits().

    Returns:
        List of dicts whose keys match the staging.raw_conditions columns.
    """
    conditions = []
    seq = 1

    for visit in visits:
        # 70% chance that this visit carries at least one diagnosis.
        if random.random() >= 0.7:
            continue

        for _ in range(random.randint(1, 2)):
            icd_code, _label = random.choice(ICD10_CODES)

            conditions.append({
                'source_condition_id': f'COND{seq:06d}',
                'source_patient_id': visit['source_patient_id'],
                'source_visit_id': visit['source_visit_id'],
                'code_diagnostic': icd_code,
                'systeme_codage': 'ICD10',
                # Diagnosis is dated on the day the visit started.
                'date_diagnostic': visit['date_debut'].date(),
                'source_fichier': 'sample_data_generation',
                'statut_traitement': 'pending',
            })
            seq += 1

    return conditions
|
||||
|
||||
|
||||
def generate_drugs(visits):
    """Generate fake ATC drug-prescription rows for a subset of visits.

    About 60% of visits receive one to three prescriptions drawn from
    ATC_CODES, each starting on the visit date and lasting 7-90 days.

    Args:
        visits: Visit dicts as produced by generate_visits().

    Returns:
        List of dicts whose keys match the staging.raw_drugs columns
        (plus 'duree_traitement', which the loader does not insert).
    """
    drugs = []
    seq = 1

    for visit in visits:
        # 60% chance that this visit carries at least one prescription.
        if random.random() >= 0.6:
            continue

        for _ in range(random.randint(1, 3)):
            atc_code, _label = random.choice(ATC_CODES)

            started = visit['date_debut']
            days = random.randint(7, 90)
            ended = started + timedelta(days=days)

            drugs.append({
                'source_drug_id': f'DRUG{seq:06d}',
                'source_patient_id': visit['source_patient_id'],
                'source_visit_id': visit['source_visit_id'],
                'code_medicament': atc_code,
                'systeme_codage': 'ATC',
                'date_debut': started.date(),
                'date_fin': ended.date(),
                'quantite': random.randint(1, 3),
                'duree_traitement': days,
                'source_fichier': 'sample_data_generation',
                'statut_traitement': 'pending',
            })
            seq += 1

    return drugs
|
||||
|
||||
|
||||
def load_data_to_staging(patients, visits, conditions, drugs):
    """Load generated rows into the staging tables.

    All four tables are loaded inside a single transaction: any failure
    rolls back everything and re-raises, so a partial load never persists.

    Args:
        patients, visits, conditions, drugs: lists of dicts produced by
            the generate_* helpers above.

    Raises:
        Exception: re-raises whatever the database driver raised, after
            rolling back the transaction.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    cursor = conn.cursor()

    try:
        # executemany() batches each table's rows through one prepared
        # statement instead of a Python-level execute() round-trip per
        # row — noticeably faster for hundreds of inserts, same SQL.
        print(f"Loading {len(patients)} patients...")
        cursor.executemany("""
            INSERT INTO staging.raw_patients
            (source_patient_id, date_naissance, sexe, code_postal,
             source_fichier, statut_traitement)
            VALUES
            (%s, %s, %s, %s, %s, %s)
        """, [
            (p['source_patient_id'], p['date_naissance'], p['sexe'],
             p['code_postal'], p['source_fichier'], p['statut_traitement'])
            for p in patients
        ])

        print(f"Loading {len(visits)} visits...")
        cursor.executemany("""
            INSERT INTO staging.raw_visits
            (source_visit_id, source_patient_id, type_visite,
             date_debut, date_fin, source_fichier, statut_traitement)
            VALUES
            (%s, %s, %s, %s, %s, %s, %s)
        """, [
            (v['source_visit_id'], v['source_patient_id'], v['type_visite'],
             v['date_debut'], v['date_fin'], v['source_fichier'],
             v['statut_traitement'])
            for v in visits
        ])

        print(f"Loading {len(conditions)} conditions...")
        cursor.executemany("""
            INSERT INTO staging.raw_conditions
            (source_condition_id, source_patient_id, source_visit_id,
             code_diagnostic, systeme_codage, date_diagnostic,
             source_fichier, statut_traitement)
            VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s)
        """, [
            (c['source_condition_id'], c['source_patient_id'],
             c['source_visit_id'], c['code_diagnostic'], c['systeme_codage'],
             c['date_diagnostic'], c['source_fichier'], c['statut_traitement'])
            for c in conditions
        ])

        # NOTE(review): 'duree_traitement' is generated but not inserted —
        # presumably the staging table has no such column; confirm schema.
        print(f"Loading {len(drugs)} drug prescriptions...")
        cursor.executemany("""
            INSERT INTO staging.raw_drugs
            (source_drug_id, source_patient_id, source_visit_id,
             code_medicament, systeme_codage, date_debut, date_fin,
             quantite, source_fichier, statut_traitement)
            VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """, [
            (d['source_drug_id'], d['source_patient_id'],
             d['source_visit_id'], d['code_medicament'], d['systeme_codage'],
             d['date_debut'], d['date_fin'], d['quantite'],
             d['source_fichier'], d['statut_traitement'])
            for d in drugs
        ])

        conn.commit()
        print("✓ All sample data loaded successfully!")

        # Print summary
        print("\n" + "="*60)
        print("SAMPLE DATA GENERATION SUMMARY")
        print("="*60)
        print(f"Patients: {len(patients)}")
        print(f"Visits: {len(visits)}")
        print(f"Conditions: {len(conditions)}")
        print(f"Drug prescriptions: {len(drugs)}")
        print("="*60)
        print("\nData loaded into staging tables with status 'pending'")
        print("Ready for ETL processing!")
        print("="*60)

    except Exception as e:
        conn.rollback()
        print(f"Error loading data: {str(e)}")
        raise
    finally:
        cursor.close()
        conn.close()
|
||||
|
||||
|
||||
def main():
    """Generate the full sample dataset and load it into staging.

    Builds the in-memory dataset parents-first (patients -> visits ->
    conditions/drugs) so every child row references an existing parent,
    then persists everything via load_data_to_staging().
    """
    print("Generating sample healthcare data...")
    print("="*60)

    # Dataset size knobs.
    num_patients = 100
    visits_per_patient = 3

    print(f"Generating {num_patients} patients...")
    patients = generate_patients(num_patients)

    print(f"Generating visits (avg {visits_per_patient} per patient)...")
    visits = generate_visits(patients, visits_per_patient)

    print("Generating conditions/diagnoses...")
    conditions = generate_conditions(visits)

    print("Generating drug prescriptions...")
    drugs = generate_drugs(visits)

    print("\nData generation complete!")
    print(f" - {len(patients)} patients")
    print(f" - {len(visits)} visits")
    print(f" - {len(conditions)} conditions")
    print(f" - {len(drugs)} drug prescriptions")

    # Persist everything into the staging schema.
    print("\nConnecting to database and loading data...")
    load_data_to_staging(patients, visits, conditions, drugs)

    print("\n✓ Sample data generation complete!")
    print("\nNext steps:")
    print(" 1. Run ETL pipeline: omop-pipeline etl run --source staging.raw_patients --target person")
    print(" 2. Check results: omop-pipeline stats show")


if __name__ == '__main__':
    main()
|
||||
80
omop/scripts/load_sample_data.sh
Executable file
80
omop/scripts/load_sample_data.sh
Executable file
@@ -0,0 +1,80 @@
|
||||
#!/bin/bash
# Load Sample Data Script
# This script sets up the database and loads sample data for testing

set -e

echo "=========================================="
echo "OMOP Sample Data Loading Script"
echo "=========================================="
echo ""

# Colors for output
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color

# Check if we're in the right directory
if [ ! -f "setup.py" ]; then
    echo -e "${RED}Error: Must be run from omop directory${NC}"
    exit 1
fi

# Step 1: Install dependencies
echo -e "${YELLOW}Step 1: Installing dependencies...${NC}"
# BUG FIX: previously ANY pip failure (e.g. a network error) printed
# "Faker already installed", masking real problems. Probe for the
# module first and surface a genuine installation failure.
if python -c "import faker" > /dev/null 2>&1; then
    echo "Faker already installed"
else
    pip install faker > /dev/null 2>&1 || {
        echo -e "${RED}Error: failed to install faker${NC}"
        exit 1
    }
fi
echo -e "${GREEN}✓ Dependencies installed${NC}"
echo ""

# Step 2: Create database schemas
echo -e "${YELLOW}Step 2: Creating database schemas...${NC}"
python -m src.cli.commands schema create --type all 2>/dev/null || echo "Schemas may already exist"
echo -e "${GREEN}✓ Schemas ready${NC}"
echo ""

# Step 3: Generate and load sample data
echo -e "${YELLOW}Step 3: Generating and loading sample data...${NC}"
python scripts/generate_sample_data.py
echo -e "${GREEN}✓ Sample data loaded${NC}"
echo ""

# Step 4: Verify data
echo -e "${YELLOW}Step 4: Verifying loaded data...${NC}"
python -c "
from src.utils.config import Config
from src.utils.db_connection import DatabaseConnection
from sqlalchemy import text

config = Config.load('config.yaml')
db = DatabaseConnection(config)

with db.get_session() as session:
    # Count records in staging tables
    tables = ['raw_patients', 'raw_visits', 'raw_conditions', 'raw_drugs']

    print('\nStaging Table Counts:')
    print('-' * 40)
    for table in tables:
        query = text(f'SELECT COUNT(*) FROM staging.{table}')
        count = session.execute(query).fetchone()[0]
        print(f'  staging.{table:20s}: {count:5d} records')
    print('-' * 40)
"
echo -e "${GREEN}✓ Data verification complete${NC}"
echo ""

echo "=========================================="
echo -e "${GREEN}Sample data loading complete!${NC}"
echo "=========================================="
echo ""
echo "Next steps:"
echo "  1. Run ETL pipeline:"
echo "     omop-pipeline etl run --source staging.raw_patients --target person"
echo ""
echo "  2. View statistics:"
echo "     omop-pipeline stats show"
echo ""
echo "  3. Validate data:"
echo "     omop-pipeline validate"
echo ""
|
||||
106
omop/scripts/load_vocabularies.sh
Executable file
106
omop/scripts/load_vocabularies.sh
Executable file
@@ -0,0 +1,106 @@
|
||||
#!/bin/bash
# Vocabulary Loading Script for OMOP Data Pipeline
# This script downloads and loads OMOP vocabularies
#
# Checks that an Athena vocabulary export is present and complete in
# $VOCAB_DIR, prints per-file record counts, then delegates the actual
# database load to `omop-pipeline vocab load`.

set -e  # Exit on error

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration
# VOCAB_DIR may be overridden from the environment; defaults to ./vocabularies.
VOCAB_DIR="${VOCAB_DIR:-./vocabularies}"
ATHENA_URL="https://athena.ohdsi.org/"

echo -e "${GREEN}OMOP Vocabulary Loader${NC}"
echo "================================"
echo "Vocabulary directory: $VOCAB_DIR"
echo "================================"
echo ""

# Check if vocabulary directory exists; if not, print download
# instructions and exit non-zero so callers can detect the failure.
if [ ! -d "$VOCAB_DIR" ]; then
    echo -e "${YELLOW}Vocabulary directory not found: $VOCAB_DIR${NC}"
    echo ""
    echo "To download OMOP vocabularies:"
    echo "1. Visit $ATHENA_URL"
    echo "2. Select the vocabularies you need"
    echo "3. Download the vocabulary bundle"
    echo "4. Extract to $VOCAB_DIR"
    echo ""
    echo "Required vocabularies for basic functionality:"
    echo "  - SNOMED"
    echo "  - ICD10CM"
    echo "  - RxNorm"
    echo "  - LOINC"
    echo "  - CPT4"
    echo ""
    exit 1
fi

# Check for required vocabulary files
# (the standard CSVs of an Athena vocabulary export).
echo -e "${YELLOW}Checking vocabulary files...${NC}"
REQUIRED_FILES=(
    "CONCEPT.csv"
    "VOCABULARY.csv"
    "DOMAIN.csv"
    "CONCEPT_CLASS.csv"
    "CONCEPT_RELATIONSHIP.csv"
    "RELATIONSHIP.csv"
)

# Collect any absent files so we can report them all at once.
MISSING_FILES=()
for file in "${REQUIRED_FILES[@]}"; do
    if [ ! -f "$VOCAB_DIR/$file" ]; then
        MISSING_FILES+=("$file")
    fi
done

if [ ${#MISSING_FILES[@]} -gt 0 ]; then
    echo -e "${RED}Error: Missing required vocabulary files:${NC}"
    for file in "${MISSING_FILES[@]}"; do
        echo "  - $file"
    done
    echo ""
    echo "Please ensure all vocabulary files are extracted to $VOCAB_DIR"
    exit 1
fi

echo -e "${GREEN}✓ All required vocabulary files found${NC}"
echo ""

# Count records in vocabulary files
# NOTE: wc -l includes the CSV header line, hence the "- 1" below.
echo -e "${YELLOW}Vocabulary file statistics:${NC}"
for file in "${REQUIRED_FILES[@]}"; do
    if [ -f "$VOCAB_DIR/$file" ]; then
        count=$(wc -l < "$VOCAB_DIR/$file")
        echo "  $file: $((count - 1)) records"
    fi
done
echo ""

# Load vocabularies using Python CLI
echo -e "${YELLOW}Loading vocabularies into database...${NC}"
echo "This may take several minutes depending on vocabulary size..."
echo ""

# The actual load requires the installed omop-pipeline entry point.
if command -v omop-pipeline &> /dev/null; then
    omop-pipeline vocab load --path "$VOCAB_DIR"
    echo ""
    echo -e "${GREEN}✓ Vocabularies loaded successfully${NC}"
else
    echo -e "${RED}Error: omop-pipeline command not found${NC}"
    echo "Please install the package with: pip install -e ."
    exit 1
fi

echo ""
echo -e "${GREEN}================================${NC}"
echo -e "${GREEN}Vocabulary loading completed!${NC}"
echo -e "${GREEN}================================${NC}"
echo ""
echo "You can now run the ETL pipeline:"
echo "  omop-pipeline etl run --source staging.raw_patients --target person"
echo ""
|
||||
73
omop/scripts/run_tests.sh
Executable file
73
omop/scripts/run_tests.sh
Executable file
@@ -0,0 +1,73 @@
|
||||
#!/bin/bash
# Test Execution Script for OMOP Data Pipeline
# This script runs all tests with coverage reporting

set -e  # Exit on error

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo -e "${GREEN}OMOP Pipeline Test Suite${NC}"
echo "================================"
echo ""

# Check if pytest is installed
if ! command -v pytest &> /dev/null; then
    echo -e "${RED}Error: pytest not found${NC}"
    echo "Please install test dependencies:"
    echo "  pip install -e .[test]"
    exit 1
fi

# Run tests with coverage
echo -e "${YELLOW}Running tests with coverage...${NC}"
echo ""

# BUG FIX: under `set -e`, a failing pytest aborted the script before
# `TEST_EXIT_CODE=$?` ran, so the failure branch below was unreachable.
# The `|| TEST_EXIT_CODE=$?` form records the status without
# triggering errexit.
TEST_EXIT_CODE=0
pytest \
    --verbose \
    --cov=src \
    --cov-report=html \
    --cov-report=term \
    --cov-report=xml \
    tests/ || TEST_EXIT_CODE=$?

echo ""
if [ $TEST_EXIT_CODE -eq 0 ]; then
    echo -e "${GREEN}================================${NC}"
    echo -e "${GREEN}All tests passed!${NC}"
    echo -e "${GREEN}================================${NC}"
    echo ""
    echo "Coverage report generated:"
    echo "  HTML: htmlcov/index.html"
    echo "  XML: coverage.xml"
    echo ""
else
    echo -e "${RED}================================${NC}"
    echo -e "${RED}Some tests failed${NC}"
    echo -e "${RED}================================${NC}"
    echo ""
    exit $TEST_EXIT_CODE
fi

# Optional: Run linting (only if flake8 is available)
if command -v flake8 &> /dev/null; then
    echo -e "${YELLOW}Running code quality checks...${NC}"
    flake8 src/ --max-line-length=100 --exclude=__pycache__,*.pyc
    echo -e "${GREEN}✓ Code quality checks passed${NC}"
    echo ""
fi

# Optional: Run type checking (only if mypy is available)
if command -v mypy &> /dev/null; then
    echo -e "${YELLOW}Running type checks...${NC}"
    mypy src/ --ignore-missing-imports
    echo -e "${GREEN}✓ Type checks passed${NC}"
    echo ""
fi

echo -e "${GREEN}Test suite completed successfully!${NC}"
|
||||
91
omop/scripts/setup_database.sh
Executable file
91
omop/scripts/setup_database.sh
Executable file
@@ -0,0 +1,91 @@
|
||||
#!/bin/bash
# Database Setup Script for OMOP Data Pipeline
# This script creates the database and schemas for the OMOP pipeline

set -e  # Exit on error

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration (can be overridden by environment variables)
DB_HOST="${DB_HOST:-localhost}"
DB_PORT="${DB_PORT:-5432}"
DB_NAME="${DB_NAME:-omop_db}"
DB_USER="${DB_USER:-postgres}"
DB_PASSWORD="${DB_PASSWORD:-}"
ADMIN_USER="${ADMIN_USER:-postgres}"
# BUG FIX: the admin psql connections previously reused DB_PASSWORD,
# conflating the application user's password with the admin's.
# ADMIN_PASSWORD defaults to DB_PASSWORD for backward compatibility
# but can now be set independently.
ADMIN_PASSWORD="${ADMIN_PASSWORD:-$DB_PASSWORD}"

echo -e "${GREEN}OMOP Database Setup${NC}"
echo "================================"
echo "Host: $DB_HOST"
echo "Port: $DB_PORT"
echo "Database: $DB_NAME"
echo "User: $DB_USER"
echo "================================"
echo ""

# Check if PostgreSQL is running
echo -e "${YELLOW}Checking PostgreSQL connection...${NC}"
if ! pg_isready -h "$DB_HOST" -p "$DB_PORT" > /dev/null 2>&1; then
    echo -e "${RED}Error: Cannot connect to PostgreSQL at $DB_HOST:$DB_PORT${NC}"
    echo "Please ensure PostgreSQL is running and accessible."
    exit 1
fi
echo -e "${GREEN}✓ PostgreSQL is running${NC}"
echo ""

# Create database if it doesn't exist
echo -e "${YELLOW}Creating database...${NC}"
if PGPASSWORD="$ADMIN_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -lqt | cut -d \| -f 1 | grep -qw "$DB_NAME"; then
    echo -e "${YELLOW}Database $DB_NAME already exists${NC}"
else
    PGPASSWORD="$ADMIN_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -c "CREATE DATABASE $DB_NAME;"
    echo -e "${GREEN}✓ Database $DB_NAME created${NC}"
fi
echo ""

# Create user if it doesn't exist
# (the new role gets DB_PASSWORD; the connection itself authenticates
# as ADMIN_USER with ADMIN_PASSWORD).
echo -e "${YELLOW}Creating database user...${NC}"
if PGPASSWORD="$ADMIN_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -d "$DB_NAME" -tAc "SELECT 1 FROM pg_roles WHERE rolname='$DB_USER'" | grep -q 1; then
    echo -e "${YELLOW}User $DB_USER already exists${NC}"
else
    PGPASSWORD="$ADMIN_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -d "$DB_NAME" -c "CREATE USER $DB_USER WITH PASSWORD '$DB_PASSWORD';"
    echo -e "${GREEN}✓ User $DB_USER created${NC}"
fi
echo ""

# Grant privileges
echo -e "${YELLOW}Granting privileges...${NC}"
PGPASSWORD="$ADMIN_PASSWORD" psql -h "$DB_HOST" -p "$DB_PORT" -U "$ADMIN_USER" -d "$DB_NAME" <<EOF
GRANT ALL PRIVILEGES ON DATABASE $DB_NAME TO $DB_USER;
GRANT ALL ON SCHEMA public TO $DB_USER;
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO $DB_USER;
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO $DB_USER;
EOF
echo -e "${GREEN}✓ Privileges granted${NC}"
echo ""

# Create schemas using the Python CLI
echo -e "${YELLOW}Creating OMOP schemas...${NC}"
if command -v omop-pipeline &> /dev/null; then
    omop-pipeline schema create --type all
    echo -e "${GREEN}✓ OMOP schemas created${NC}"
else
    echo -e "${YELLOW}Warning: omop-pipeline command not found${NC}"
    echo "Please install the package with: pip install -e ."
    echo "Then run: omop-pipeline schema create --type all"
fi
echo ""

echo -e "${GREEN}================================${NC}"
echo -e "${GREEN}Database setup completed!${NC}"
echo -e "${GREEN}================================${NC}"
echo ""
echo "Next steps:"
echo "1. Load vocabularies: omop-pipeline vocab load --path /path/to/vocabularies"
echo "2. Load staging data into staging tables"
echo "3. Run ETL: omop-pipeline etl run --source staging.raw_patients --target person"
echo ""
|
||||
Reference in New Issue
Block a user