#!/bin/bash
#
# Vocabulary Loading Script for OMOP Data Pipeline
#
# Validates a local OMOP vocabulary directory and loads it into the database
# through the `omop-pipeline` CLI. The script does not download anything
# itself: when the directory is missing it prints the manual download steps
# (via Athena) and exits.
#
# Environment:
#   VOCAB_DIR  Directory holding the extracted vocabulary CSV files
#              (default: ./vocabularies)

# Strict mode: abort on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Colors for output
# The values are kept as backslash sequences; they only become real terminal
# escapes when printed through `echo -e` or `printf '%b'`. Setting NO_COLOR to
# any non-empty value blanks them so captured logs stay free of control codes.
if [[ -n "${NO_COLOR:-}" ]]; then
  RED=''
  GREEN=''
  YELLOW=''
  NC=''
else
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[1;33m'
  NC='\033[0m' # No Color (reset)
fi

# Configuration
# VOCAB_DIR may be supplied by the environment; an unset or empty value falls
# back to ./vocabularies, resolved against the current working directory.
VOCAB_DIR="${VOCAB_DIR:-./vocabularies}"
ATHENA_URL="https://athena.ohdsi.org/"
# Startup banner: script title followed by the vocabulary directory in effect.
# Only the title goes through %b (to render the color sequences); the
# remaining lines use %s so the directory path is printed verbatim.
printf '%b\n' "${GREEN}OMOP Vocabulary Loader${NC}"
printf '%s\n' \
  "================================" \
  "Vocabulary directory: $VOCAB_DIR" \
  "================================" \
  ""
# Check if vocabulary directory exists
# Without the directory there is nothing to load, so explain how to obtain the
# vocabularies and stop with status 1. The whole message is a diagnostic and
# is therefore written to stderr.
if [[ ! -d "$VOCAB_DIR" ]]; then
  {
    # The path is passed through %s rather than `echo -e`, so backslashes in
    # VOCAB_DIR are shown as typed instead of being expanded as escapes.
    printf '%b%s%b\n' "$YELLOW" "Vocabulary directory not found: $VOCAB_DIR" "$NC"
    printf '%s\n' \
      "" \
      "To download OMOP vocabularies:" \
      "1. Visit $ATHENA_URL" \
      "2. Select the vocabularies you need" \
      "3. Download the vocabulary bundle" \
      "4. Extract to $VOCAB_DIR" \
      "" \
      "Required vocabularies for basic functionality:" \
      " - SNOMED" \
      " - ICD10CM" \
      " - RxNorm" \
      " - LOINC" \
      " - CPT4" \
      ""
  } >&2
  exit 1
fi
# Check for required vocabulary files
printf '%b\n' "${YELLOW}Checking vocabulary files...${NC}"

# Files that must be present in VOCAB_DIR before a load is attempted.
REQUIRED_FILES=(
  "CONCEPT.csv"
  "VOCABULARY.csv"
  "DOMAIN.csv"
  "CONCEPT_CLASS.csv"
  "CONCEPT_RELATIONSHIP.csv"
  "RELATIONSHIP.csv"
)

# Collect every missing file before reporting, so a single run lists all of
# them instead of stopping at the first gap.
MISSING_FILES=()
for file in "${REQUIRED_FILES[@]}"; do
  [[ -f "$VOCAB_DIR/$file" ]] || MISSING_FILES+=("$file")
done

if (( ${#MISSING_FILES[@]} > 0 )); then
  # Error report goes to stderr; printf repeats its format once per array
  # element, producing one " - name" line per missing file.
  {
    printf '%b\n' "${RED}Error: Missing required vocabulary files:${NC}"
    printf ' - %s\n' "${MISSING_FILES[@]}"
    printf '%s\n' \
      "" \
      "Please ensure all vocabulary files are extracted to $VOCAB_DIR"
  } >&2
  exit 1
fi

printf '%b\n' "${GREEN}✓ All required vocabulary files found${NC}"
echo ""
# Count records in vocabulary files

#######################################
# Print the number of records in a file, not counting its first line
# (assumed to be a header row).
# Arguments: $1 - path to the file
# Outputs:   the record count on stdout; never negative
# Returns:   non-zero if the file cannot be opened or awk fails
#######################################
count_records() {
  # awk's NR also counts a final line that has no trailing newline (which
  # `wc -l` would miss), and the guard reports 0 rather than -1 for an empty
  # file. The file is fed on stdin so an '=' in the path is never mistaken for
  # an awk variable assignment.
  awk 'END { print (NR > 0 ? NR - 1 : 0) }' < "$1"
}

printf '%b\n' "${YELLOW}Vocabulary file statistics:${NC}"
for file in "${REQUIRED_FILES[@]}"; do
  if [[ -f "$VOCAB_DIR/$file" ]]; then
    count=$(count_records "$VOCAB_DIR/$file")
    echo " $file: $count records"
  fi
done
echo ""
# Load vocabularies using the omop-pipeline CLI
printf '%b\n' "${YELLOW}Loading vocabularies into database...${NC}"
echo "This may take several minutes depending on vocabulary size..."
echo ""

# Guard clause: without the CLI on PATH nothing can be loaded.
if ! command -v omop-pipeline > /dev/null 2>&1; then
  {
    printf '%b\n' "${RED}Error: omop-pipeline command not found${NC}"
    echo "Please install the package with: pip install -e ."
  } >&2
  exit 1
fi

# Capture the status explicitly instead of letting `set -e` abort silently, so
# a failed load is reported and the CLI's own exit status is passed on.
load_status=0
omop-pipeline vocab load --path "$VOCAB_DIR" || load_status=$?
if (( load_status != 0 )); then
  printf '%b\n' \
    "${RED}Error: omop-pipeline vocab load failed (exit status ${load_status})${NC}" >&2
  exit "$load_status"
fi

echo ""
printf '%b\n' "${GREEN}✓ Vocabularies loaded successfully${NC}"
# Closing banner and a pointer to the next pipeline step.
echo ""
# printf repeats the %b format once per argument: three colored lines.
printf '%b\n' \
  "${GREEN}================================${NC}" \
  "${GREEN}Vocabulary loading completed!${NC}" \
  "${GREEN}================================${NC}"
printf '%s\n' \
  "" \
  "You can now run the ETL pipeline:" \
  " omop-pipeline etl run --source staging.raw_patients --target person" \
  ""