aivanov_database/omop/scripts/load_vocabularies.sh

#!/bin/bash
# Vocabulary Loading Script for OMOP Data Pipeline
# This script downloads and loads OMOP vocabularies

# Abort the script as soon as any unguarded command fails.
set -o errexit

# ANSI escape sequences used to colourise output. They are stored with a
# literal backslash and only become real escapes when printed through an
# escape-interpreting call (echo -e / printf %b).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset to the terminal's default colour

# Settings: VOCAB_DIR may be supplied through the environment; when it is
# unset or empty it falls back to ./vocabularies.
: "${VOCAB_DIR:=./vocabularies}"
ATHENA_URL="https://athena.ohdsi.org/"

# Startup banner showing which directory will be inspected.
printf '%b\n' "${GREEN}OMOP Vocabulary Loader${NC}"
printf '%s\n' \
    "================================" \
    "Vocabulary directory: $VOCAB_DIR" \
    "================================" \
    ""

# Stop early when the vocabulary directory is absent: print download
# instructions pointing at $ATHENA_URL and exit with status 1.
if [[ ! -d "$VOCAB_DIR" ]]; then
    printf '%b\n' "${YELLOW}Vocabulary directory not found: $VOCAB_DIR${NC}"
    printf '%s\n' \
        "" \
        "To download OMOP vocabularies:" \
        "1. Visit $ATHENA_URL" \
        "2. Select the vocabularies you need" \
        "3. Download the vocabulary bundle" \
        "4. Extract to $VOCAB_DIR" \
        "" \
        "Required vocabularies for basic functionality:" \
        "  - SNOMED" \
        "  - ICD10CM" \
        "  - RxNorm" \
        "  - LOINC" \
        "  - CPT4" \
        ""
    exit 1
fi

# Verify that every vocabulary file the loader depends on is present in
# $VOCAB_DIR. All missing names are collected first so they can be reported
# together before exiting with status 1.
printf '%b\n' "${YELLOW}Checking vocabulary files...${NC}"
REQUIRED_FILES=(
    "CONCEPT.csv"
    "VOCABULARY.csv"
    "DOMAIN.csv"
    "CONCEPT_CLASS.csv"
    "CONCEPT_RELATIONSHIP.csv"
    "RELATIONSHIP.csv"
)

MISSING_FILES=()
for required in "${REQUIRED_FILES[@]}"; do
    # Only regular files count; a directory of the same name is "missing".
    [[ -f "$VOCAB_DIR/$required" ]] || MISSING_FILES+=("$required")
done

if (( ${#MISSING_FILES[@]} > 0 )); then
    printf '%b\n' "${RED}Error: Missing required vocabulary files:${NC}"
    # printf reapplies its format once per array element.
    printf '  - %s\n' "${MISSING_FILES[@]}"
    printf '%s\n' \
        "" \
        "Please ensure all vocabulary files are extracted to $VOCAB_DIR"
    exit 1
fi

printf '%b\n' "${GREEN}✓ All required vocabulary files found${NC}"
printf '%s\n' ""

# Count records in vocabulary files

#######################################
# Print the number of data rows in a file: its line count minus one line,
# which is presumably the column header (the script has always subtracted
# one; confirm against the vocabulary file format).
#
# Lines are counted with awk's NR rather than `wc -l` because `wc -l`
# counts newline characters and therefore misses a final row that has no
# trailing newline. The result is clamped at zero so an empty file reports
# "0" instead of "-1".
#
# Arguments: $1 - path to the file to inspect
# Outputs:   the row count on stdout
# Returns:   non-zero if the file cannot be read
#######################################
count_records() {
    local path=$1
    local lines
    # Feed the file on stdin so a path containing '=' is never mistaken by
    # awk for a variable assignment operand.
    lines=$(awk 'END { print NR }' < "$path") || return
    if (( lines > 0 )); then
        printf '%s\n' "$(( lines - 1 ))"
    else
        printf '0\n'
    fi
}

echo -e "${YELLOW}Vocabulary file statistics:${NC}"
for file in "${REQUIRED_FILES[@]}"; do
    if [ -f "$VOCAB_DIR/$file" ]; then
        # Plain assignment keeps a read failure visible to errexit.
        count=$(count_records "$VOCAB_DIR/$file")
        echo "  $file: $count records"
    fi
done
echo ""

# Load vocabularies using Python CLI
#
# Runs `omop-pipeline vocab load` against $VOCAB_DIR. The success message is
# printed only after the loader's exit status has been checked explicitly,
# so a failed load is reported by this script and never followed by a
# success line, even if errexit is not in effect. On failure the script
# exits with the loader's own status; if the CLI is not installed it exits
# with status 1.
echo -e "${YELLOW}Loading vocabularies into database...${NC}"
echo "This may take several minutes depending on vocabulary size..."
echo ""

if command -v omop-pipeline &> /dev/null; then
    # Capture the status through `||` so errexit does not terminate the
    # script before the failure can be reported.
    load_status=0
    omop-pipeline vocab load --path "$VOCAB_DIR" || load_status=$?
    if [ "$load_status" -ne 0 ]; then
        echo ""
        echo -e "${RED}Error: vocabulary load failed (exit code $load_status)${NC}"
        exit "$load_status"
    fi
    echo ""
    echo -e "${GREEN}✓ Vocabularies loaded successfully${NC}"
else
    echo -e "${RED}Error: omop-pipeline command not found${NC}"
    echo "Please install the package with: pip install -e ."
    exit 1
fi

# Closing banner plus a hint for the next step (running the ETL pipeline).
printf '%s\n' ""
printf '%b\n' \
    "${GREEN}================================${NC}" \
    "${GREEN}Vocabulary loading completed!${NC}" \
    "${GREEN}================================${NC}"
printf '%s\n' \
    "" \
    "You can now run the ETL pipeline:" \
    "  omop-pipeline etl run --source staging.raw_patients --target person" \
    ""