Initial commit

2026-03-05 01:20:15 +01:00
commit c0c50e56f0
364 changed files with 62207 additions and 0 deletions
--- a/omop/scripts/load_vocabularies.sh
+++ b/omop/scripts/load_vocabularies.sh
@@ -0,0 +1,150 @@
+#!/bin/bash
+# Vocabulary Loading Script for OMOP Data Pipeline
+#
+# Validates a directory of already-downloaded OMOP vocabulary CSV files and
+# loads them into the database through the `omop-pipeline` CLI. Nothing is
+# downloaded by this script: when the directory is missing it only prints
+# instructions for fetching a vocabulary bundle from Athena.
+#
+# Environment:
+#   VOCAB_DIR  Directory holding the extracted vocabulary CSV files
+#              (default: ./vocabularies).
+#
+# Exit status:
+#   0      vocabularies loaded
+#   1      the vocabulary directory or a required file is missing, or the
+#          omop-pipeline command is not installed
+#   other  exit status of `omop-pipeline vocab load` when the load fails
+
+# Abort on command failure, on use of an unset variable, and on a failure in
+# any stage of a pipeline.
+set -euo pipefail
+
+# Colors for output. ANSI-C quoting stores the real escape bytes, so they can
+# be passed to printf as plain %s arguments.
+RED=$'\033[0;31m'
+GREEN=$'\033[0;32m'
+YELLOW=$'\033[1;33m'
+NC=$'\033[0m' # No Color
+
+# Configuration
+VOCAB_DIR="${VOCAB_DIR:-./vocabularies}"
+ATHENA_URL="https://athena.ohdsi.org/"
+
+# Print each message on its own line, wrapped in a color.
+#   $1  - color escape sequence
+#   $2+ - messages; printed verbatim (no backslash interpretation)
+say() {
+    local color=$1 line
+    shift
+    for line in "$@"; do
+        printf '%s%s%s\n' "$color" "$line" "$NC"
+    done
+}
+
+say "$GREEN" "OMOP Vocabulary Loader"
+printf '%s\n' \
+    "================================" \
+    "Vocabulary directory: $VOCAB_DIR" \
+    "================================" \
+    ""
+
+# Check if vocabulary directory exists; diagnostics go to stderr.
+if [[ ! -d "$VOCAB_DIR" ]]; then
+    {
+        say "$YELLOW" "Vocabulary directory not found: $VOCAB_DIR"
+        printf '%s\n' \
+            "" \
+            "To download OMOP vocabularies:" \
+            "1. Visit $ATHENA_URL" \
+            "2. Select the vocabularies you need" \
+            "3. Download the vocabulary bundle" \
+            "4. Extract to $VOCAB_DIR" \
+            "" \
+            "Required vocabularies for basic functionality:" \
+            "  - SNOMED" \
+            "  - ICD10CM" \
+            "  - RxNorm" \
+            "  - LOINC" \
+            "  - CPT4" \
+            ""
+    } >&2
+    exit 1
+fi
+
+# Check for required vocabulary files
+say "$YELLOW" "Checking vocabulary files..."
+REQUIRED_FILES=(
+    "CONCEPT.csv"
+    "VOCABULARY.csv"
+    "DOMAIN.csv"
+    "CONCEPT_CLASS.csv"
+    "CONCEPT_RELATIONSHIP.csv"
+    "RELATIONSHIP.csv"
+)
+
+MISSING_FILES=()
+for file in "${REQUIRED_FILES[@]}"; do
+    [[ -f "$VOCAB_DIR/$file" ]] || MISSING_FILES+=("$file")
+done
+
+if (( ${#MISSING_FILES[@]} > 0 )); then
+    {
+        say "$RED" "Error: Missing required vocabulary files:"
+        printf '  - %s\n' "${MISSING_FILES[@]}"
+        printf '%s\n' \
+            "" \
+            "Please ensure all vocabulary files are extracted to $VOCAB_DIR"
+    } >&2
+    exit 1
+fi
+
+say "$GREEN" "✓ All required vocabulary files found"
+printf '\n'
+
+# Count records in vocabulary files. wc -l counts newline characters and one
+# line is subtracted for the presumed CSV header row; the result is clamped
+# at zero so an empty file is not reported as "-1 records".
+say "$YELLOW" "Vocabulary file statistics:"
+for file in "${REQUIRED_FILES[@]}"; do
+    lines=$(wc -l < "$VOCAB_DIR/$file")
+    records=$(( lines > 0 ? lines - 1 : 0 ))
+    printf '  %s: %d records\n' "$file" "$records"
+done
+printf '\n'
+
+# Load vocabularies using the omop-pipeline CLI
+say "$YELLOW" "Loading vocabularies into database..."
+printf '%s\n' \
+    "This may take several minutes depending on vocabulary size..." \
+    ""
+
+if ! command -v omop-pipeline > /dev/null 2>&1; then
+    {
+        say "$RED" "Error: omop-pipeline command not found"
+        printf '%s\n' "Please install the package with: pip install -e ."
+    } >&2
+    exit 1
+fi
+
+# Capture the loader's status explicitly so a failure is reported instead of
+# the script dying silently under `set -e`; the status is passed through.
+load_status=0
+omop-pipeline vocab load --path "$VOCAB_DIR" || load_status=$?
+if (( load_status != 0 )); then
+    say "$RED" "Error: vocabulary load failed (exit status $load_status)" >&2
+    exit "$load_status"
+fi
+printf '\n'
+say "$GREEN" "✓ Vocabularies loaded successfully"
+
+printf '\n'
+say "$GREEN" \
+    "================================" \
+    "Vocabulary loading completed!" \
+    "================================"
+printf '%s\n' \
+    "" \
+    "You can now run the ETL pipeline:" \
+    "  omop-pipeline etl run --source staging.raw_patients --target person" \
+    ""