#!/usr/bin/env python3
"""
Generate Sample Data for OMOP Pipeline Testing

This script generates fictional healthcare data and loads it into staging
tables. It creates realistic but completely fake patient, visit, condition,
and drug data.
"""

import sys
import os
from pathlib import Path
from datetime import datetime, timedelta
import random

from faker import Faker
from sqlalchemy import text
import psycopg2

# Database configuration.
# Each value can be overridden through an OMOP_DB_* environment variable so
# that real credentials never have to be committed; the defaults are the
# original local-development settings.
DB_CONFIG = {
    'host': os.environ.get('OMOP_DB_HOST', 'localhost'),
    'port': int(os.environ.get('OMOP_DB_PORT', '5432')),
    'database': os.environ.get('OMOP_DB_NAME', 'omop_cdm'),
    'user': os.environ.get('OMOP_DB_USER', 'dom'),
    'password': os.environ.get('OMOP_DB_PASSWORD', 'loli'),
}

# Initialize Faker for generating fake data
fake = Faker('fr_FR')  # French locale
Faker.seed(42)  # For reproducibility
random.seed(42)

# Sample medical codes: (code, label) pairs
ICD10_CODES = [
    ('E11.9', 'Diabète de type 2 sans complication'),
    ('I10', 'Hypertension essentielle'),
    ('J45.9', 'Asthme non précisé'),
    ('M79.3', 'Panniculite non précisée'),
    ('K21.9', 'Reflux gastro-oesophagien sans oesophagite'),
]

ATC_CODES = [
    ('A10BA02', 'Metformine'),
    ('C09AA02', 'Enalapril'),
    ('R03AC02', 'Salbutamol'),
    ('A02BC01', 'Oméprazole'),
    ('N02BE01', 'Paracétamol'),
]

VISIT_TYPES = [
    ('consultation', 'Consultation externe'),
    ('urgence', 'Urgence'),
    ('hospitalisation', 'Hospitalisation'),
]

# Staging targets: (label used in progress messages, table, inserted columns).
# Only the listed columns are written; extra keys in a generated row (such as
# 'duree_traitement' on drugs) are not inserted, as in the original script.
_STAGING_TARGETS = {
    'patients': (
        'staging.raw_patients',
        ('source_patient_id', 'date_naissance', 'sexe', 'code_postal',
         'source_fichier', 'statut_traitement'),
    ),
    'visits': (
        'staging.raw_visits',
        ('source_visit_id', 'source_patient_id', 'type_visite', 'date_debut',
         'date_fin', 'source_fichier', 'statut_traitement'),
    ),
    'conditions': (
        'staging.raw_conditions',
        ('source_condition_id', 'source_patient_id', 'source_visit_id',
         'code_diagnostic', 'systeme_codage', 'date_diagnostic',
         'source_fichier', 'statut_traitement'),
    ),
    'drug prescriptions': (
        'staging.raw_drugs',
        ('source_drug_id', 'source_patient_id', 'source_visit_id',
         'code_medicament', 'systeme_codage', 'date_debut', 'date_fin',
         'quantite', 'source_fichier', 'statut_traitement'),
    ),
}


def generate_patients(num_patients=100):
    """Generate fake patient data.

    Args:
        num_patients: Number of patient rows to create.

    Returns:
        A list of dicts, one per patient, with ids 'PAT00001', 'PAT00002', ...
        Birth dates come from Faker (adults aged 18 to 90).
    """
    patients = []
    for number in range(1, num_patients + 1):
        birth_date = fake.date_of_birth(minimum_age=18, maximum_age=90)
        patients.append({
            'source_patient_id': f'PAT{number:05d}',
            'date_naissance': birth_date,
            'sexe': random.choice(['M', 'F']),
            'code_postal': fake.postcode(),
            'source_fichier': 'sample_data_generation',
            'statut_traitement': 'pending',
        })
    return patients


def generate_visits(patients, visits_per_patient=3, reference_date=None):
    """Generate fake visit data.

    Args:
        patients: Patient dicts as returned by generate_patients(); only
            'source_patient_id' is read.
        visits_per_patient: Upper bound of visits per patient; each patient
            gets between 1 and this many visits (inclusive).
        reference_date: datetime that visit dates are counted back from.
            Defaults to the current time. Pass a fixed datetime to get the
            same visit dates on every run with the same random seed.

    Returns:
        A list of visit dicts with ids 'VIS000001', 'VIS000002', ...
    """
    # Resolve the anchor once so every visit in a run shares the same "now".
    if reference_date is None:
        reference_date = datetime.now()

    visits = []
    visit_id = 1
    for patient in patients:
        num_visits = random.randint(1, visits_per_patient)
        for _ in range(num_visits):
            visit_type, _label = random.choice(VISIT_TYPES)

            # Visit start falls within the two years before reference_date.
            days_ago = random.randint(1, 730)
            visit_start = reference_date - timedelta(days=days_ago)

            # Visit duration in days depends on the visit type.
            # NOTE: a stay can last up to 14 days while the start may be only
            # one day back, so 'date_fin' can fall after reference_date.
            if visit_type == 'hospitalisation':
                duration = random.randint(1, 14)
            elif visit_type == 'urgence':
                duration = random.randint(0, 1)
            else:
                duration = 0
            visit_end = visit_start + timedelta(days=duration)

            visits.append({
                'source_visit_id': f'VIS{visit_id:06d}',
                'source_patient_id': patient['source_patient_id'],
                'type_visite': visit_type,
                'date_debut': visit_start,
                'date_fin': visit_end,
                'source_fichier': 'sample_data_generation',
                'statut_traitement': 'pending',
            })
            visit_id += 1
    return visits


def generate_conditions(visits):
    """Generate fake condition/diagnosis data.

    Args:
        visits: Visit dicts as returned by generate_visits(); 'date_debut'
            must be a datetime because its .date() is used.

    Returns:
        A list of condition dicts. Each visit has a 70% chance of carrying
        one or two ICD10 diagnoses dated on the visit start day.
    """
    conditions = []
    condition_id = 1
    for visit in visits:
        # 70% chance of having a condition
        if random.random() >= 0.7:
            continue
        num_conditions = random.randint(1, 2)
        for _ in range(num_conditions):
            code, _label = random.choice(ICD10_CODES)
            conditions.append({
                'source_condition_id': f'COND{condition_id:06d}',
                'source_patient_id': visit['source_patient_id'],
                'source_visit_id': visit['source_visit_id'],
                'code_diagnostic': code,
                'systeme_codage': 'ICD10',
                'date_diagnostic': visit['date_debut'].date(),
                'source_fichier': 'sample_data_generation',
                'statut_traitement': 'pending',
            })
            condition_id += 1
    return conditions


def generate_drugs(visits):
    """Generate fake drug prescription data.

    Args:
        visits: Visit dicts as returned by generate_visits(); 'date_debut'
            must be a datetime.

    Returns:
        A list of drug dicts. Each visit has a 60% chance of carrying one to
        three ATC-coded prescriptions that start on the visit start day and
        last 7 to 90 days.
    """
    drugs = []
    drug_id = 1
    for visit in visits:
        # 60% chance of having a drug prescription
        if random.random() >= 0.6:
            continue
        num_drugs = random.randint(1, 3)
        for _ in range(num_drugs):
            code, _label = random.choice(ATC_CODES)
            drug_start = visit['date_debut']
            duration = random.randint(7, 90)
            drug_end = drug_start + timedelta(days=duration)
            drugs.append({
                'source_drug_id': f'DRUG{drug_id:06d}',
                'source_patient_id': visit['source_patient_id'],
                'source_visit_id': visit['source_visit_id'],
                'code_medicament': code,
                'systeme_codage': 'ATC',
                'date_debut': drug_start.date(),
                'date_fin': drug_end.date(),
                'quantite': random.randint(1, 3),
                'duree_traitement': duration,
                'source_fichier': 'sample_data_generation',
                'statut_traitement': 'pending',
            })
            drug_id += 1
    return drugs


def _build_insert(table, columns):
    """Return an INSERT statement for *table* using named placeholders.

    Table and column names come only from the module constant
    _STAGING_TARGETS; row values are still bound by the database driver.
    """
    column_list = ', '.join(columns)
    placeholders = ', '.join(f'%({column})s' for column in columns)
    return f'INSERT INTO {table} ({column_list}) VALUES ({placeholders})'


def load_data_to_staging(patients, visits, conditions, drugs):
    """Load generated data into staging tables.

    All four datasets are inserted in a single transaction: either every row
    is committed or, on any error, everything is rolled back.

    Args:
        patients: Rows from generate_patients().
        visits: Rows from generate_visits().
        conditions: Rows from generate_conditions().
        drugs: Rows from generate_drugs().

    Raises:
        Exception: Any error raised while inserting or committing is printed
            and re-raised after the rollback.
    """
    datasets = {
        'patients': patients,
        'visits': visits,
        'conditions': conditions,
        'drug prescriptions': drugs,
    }

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # The cursor is opened inside the try block so the connection is
        # still closed if cursor creation itself fails.
        with conn.cursor() as cursor:
            for label, rows in datasets.items():
                table, columns = _STAGING_TARGETS[label]
                print(f"Loading {len(rows)} {label}...")
                # Named placeholders bind straight from the row dicts, so
                # column order cannot drift from value order.
                cursor.executemany(_build_insert(table, columns), rows)
        conn.commit()
    except Exception as e:
        conn.rollback()
        print(f"Error loading data: {e}")
        raise
    else:
        print("✓ All sample data loaded successfully!")

        # Print summary
        print("\n" + "="*60)
        print("SAMPLE DATA GENERATION SUMMARY")
        print("="*60)
        print(f"Patients: {len(patients)}")
        print(f"Visits: {len(visits)}")
        print(f"Conditions: {len(conditions)}")
        print(f"Drug prescriptions: {len(drugs)}")
        print("="*60)
        print("\nData loaded into staging tables with status 'pending'")
        print("Ready for ETL processing!")
        print("="*60)
    finally:
        conn.close()


def main():
    """Generate the sample datasets and load them into the staging tables."""
    print("Generating sample healthcare data...")
    print("="*60)

    # Configuration
    num_patients = 100
    visits_per_patient = 3

    # Generate data
    print(f"Generating {num_patients} patients...")
    patients = generate_patients(num_patients)

    # visits_per_patient is an upper bound (1..N visits each), not an average.
    print(f"Generating visits (up to {visits_per_patient} per patient)...")
    visits = generate_visits(patients, visits_per_patient)

    print("Generating conditions/diagnoses...")
    conditions = generate_conditions(visits)

    print("Generating drug prescriptions...")
    drugs = generate_drugs(visits)

    print("\nData generation complete!")
    print(f" - {len(patients)} patients")
    print(f" - {len(visits)} visits")
    print(f" - {len(conditions)} conditions")
    print(f" - {len(drugs)} drug prescriptions")

    # Load data
    print("\nConnecting to database and loading data...")
    load_data_to_staging(patients, visits, conditions, drugs)

    print("\n✓ Sample data generation complete!")
    print("\nNext steps:")
    print(" 1. Run ETL pipeline: omop-pipeline etl run --source staging.raw_patients --target person")
    print(" 2. Check results: omop-pipeline stats show")


if __name__ == '__main__':
    main()