Files
aivanov_database/omop/scripts/load_vocabularies.sh
2026-03-05 01:20:15 +01:00

107 lines
2.8 KiB
Bash
Executable File

#!/bin/bash
#
# OMOP vocabulary loader.
#
# Checks a directory of already-extracted OMOP vocabulary CSV files and hands
# it to the omop-pipeline CLI for loading. Nothing is downloaded by this
# script; when the directory is absent it only prints manual download steps.
#
# Environment:
#   VOCAB_DIR  directory containing the extracted vocabulary files
#              (falls back to ./vocabularies when unset or empty)

set -o errexit # stop at the first unhandled command failure

# ANSI colour sequences, stored unexpanded; they are rendered by `echo -e`
# or `printf '%b'` at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset to the terminal's default colour

# Settings
: "${VOCAB_DIR:=./vocabularies}"
ATHENA_URL="https://athena.ohdsi.org/"

# Start-up banner showing which directory will be inspected.
printf '%b\n' "${GREEN}OMOP Vocabulary Loader${NC}"
printf '%s\n' \
  "================================" \
  "Vocabulary directory: $VOCAB_DIR" \
  "================================" \
  ""
# Stop early when the vocabulary directory is absent: print manual download
# instructions and exit with status 1.
[[ -d "$VOCAB_DIR" ]] || {
  echo -e "${YELLOW}Vocabulary directory not found: $VOCAB_DIR${NC}"
  printf '%s\n' \
    "" \
    "To download OMOP vocabularies:" \
    "1. Visit $ATHENA_URL" \
    "2. Select the vocabularies you need" \
    "3. Download the vocabulary bundle" \
    "4. Extract to $VOCAB_DIR" \
    "" \
    "Required vocabularies for basic functionality:" \
    " - SNOMED" \
    " - ICD10CM" \
    " - RxNorm" \
    " - LOINC" \
    " - CPT4" \
    ""
  exit 1
}
# Confirm that each vocabulary file the loader depends on exists directly
# inside $VOCAB_DIR.
printf '%b\n' "${YELLOW}Checking vocabulary files...${NC}"

# File names that must be present.
REQUIRED_FILES=(
  "CONCEPT.csv" "VOCABULARY.csv" "DOMAIN.csv"
  "CONCEPT_CLASS.csv" "CONCEPT_RELATIONSHIP.csv" "RELATIONSHIP.csv"
)

# Gather every absent file first so they are all reported in a single run.
MISSING_FILES=()
for required in "${REQUIRED_FILES[@]}"; do
  [[ -f "$VOCAB_DIR/$required" ]] || MISSING_FILES+=("$required")
done

if (( ${#MISSING_FILES[@]} > 0 )); then
  printf '%b\n' "${RED}Error: Missing required vocabulary files:${NC}"
  # printf repeats its format once per array element.
  printf ' - %s\n' "${MISSING_FILES[@]}"
  printf '%s\n' "" "Please ensure all vocabulary files are extracted to $VOCAB_DIR"
  exit 1
fi

printf '%b\n' "${GREEN}✓ All required vocabulary files found${NC}"
printf '\n'
# Count records in vocabulary files

#######################################
# Print the number of data records in a text file whose first line is a
# header.
#
# `wc -l` counts newline characters, so on its own it misses a final line
# that has no trailing newline, and subtracting the header from an empty
# file would give -1. Both cases are corrected here.
#
# Arguments: $1 - path of the file to inspect
# Outputs:   the record count (never negative) on stdout
# Returns:   non-zero when the file cannot be read
#######################################
count_vocab_records() {
  local path=$1
  local line_count last_byte

  line_count=$(wc -l < "$path") || return 1

  # Command substitution strips a trailing newline, so a non-empty result
  # means the file ends in an unterminated line that wc did not count.
  last_byte=$(tail -c 1 < "$path") || return 1
  if [[ -n "$last_byte" ]]; then
    line_count=$((line_count + 1))
  fi

  # Drop the header line; an empty file has zero records, not -1.
  if (( line_count > 0 )); then
    echo $((line_count - 1))
  else
    echo 0
  fi
}

echo -e "${YELLOW}Vocabulary file statistics:${NC}"
for file in "${REQUIRED_FILES[@]}"; do
  if [ -f "$VOCAB_DIR/$file" ]; then
    # Assign separately so a read failure still aborts the script under set -e.
    records=$(count_vocab_records "$VOCAB_DIR/$file")
    echo " $file: $records records"
  fi
done
echo ""
# Load vocabularies using Python CLI
echo -e "${YELLOW}Loading vocabularies into database...${NC}"
echo "This may take several minutes depending on vocabulary size..."
echo ""

# Without the CLI on PATH there is nothing to run; say how to install it.
if ! command -v omop-pipeline &> /dev/null; then
  echo -e "${RED}Error: omop-pipeline command not found${NC}"
  echo "Please install the package with: pip install -e ."
  exit 1
fi

# Capture the exit status explicitly instead of relying on `set -e`, so a
# failed load is reported by this script before it exits with that status.
load_status=0
omop-pipeline vocab load --path "$VOCAB_DIR" || load_status=$?
if [ "$load_status" -ne 0 ]; then
  echo ""
  echo -e "${RED}Error: vocabulary load failed (exit code $load_status)${NC}"
  exit "$load_status"
fi

echo ""
echo -e "${GREEN}✓ Vocabularies loaded successfully${NC}"
# Closing banner, followed by a hint showing an example ETL command.
printf '\n'
printf '%b\n' \
  "${GREEN}================================${NC}" \
  "${GREEN}Vocabulary loading completed!${NC}" \
  "${GREEN}================================${NC}"
printf '%s\n' \
  "" \
  "You can now run the ETL pipeline:" \
  " omop-pipeline etl run --source staging.raw_patients --target person" \
  ""