#!/bin/bash
# split_fixed.sh - Versión con awk corregido

# --- Configuration -----------------------------------------------------------
# INPUT_FILE      CSV file to split; its first line is treated as the header.
# OUTPUT_PREFIX   Prefix of the compressed part files (<prefix><n>.csv.gz).
# INDEX_FILE      JSON index produced by the awk pass.
# LINES_PER_FILE  Maximum number of data records written to each part.
INPUT_FILE="${1:-}"
OUTPUT_PREFIX="normativa_"
INDEX_FILE="indice_normativa.json"
LINES_PER_FILE=50000

# Fail fast: every later step reads INPUT_FILE, so continuing without a
# readable file would only produce an empty index and a misleading summary.
if [[ -z "$INPUT_FILE" ]]; then
    echo "❌ Uso: $0 <archivo.csv>" >&2
    exit 2
fi
if [[ ! -f "$INPUT_FILE" || ! -r "$INPUT_FILE" ]]; then
    echo "❌ No se puede leer el archivo: $INPUT_FILE" >&2
    exit 1
fi

echo "🚀 Procesando archivo: $INPUT_FILE"
echo "📊 Creando índice y dividiendo archivo..."

# 1. COUNT LINES
echo "📈 Contando líneas..."
TOTAL_LINES=$(wc -l < "$INPUT_FILE") || exit 1
# Arithmetic expansion strips the padding some wc implementations add.
TOTAL_LINES=$((TOTAL_LINES))
# wc -l counts newline characters, so a final line without a trailing newline
# is not included. Command substitution strips a trailing newline, hence a
# non-empty result means the last byte of the file is not a newline.
if [[ -s "$INPUT_FILE" && -n "$(tail -c 1 < "$INPUT_FILE")" ]]; then
    TOTAL_LINES=$((TOTAL_LINES + 1))
fi

# The first line is the header; never report a negative record count.
TOTAL_RECORDS=$((TOTAL_LINES > 0 ? TOTAL_LINES - 1 : 0))
echo "📊 Total de registros: $TOTAL_RECORDS"

# 2. COMPUTE NUMBER OF PARTS (ceiling division)
TOTAL_PARTS=$(( (TOTAL_RECORDS + LINES_PER_FILE - 1) / LINES_PER_FILE ))
echo "📦 Partes estimadas: $TOTAL_PARTS"

# 3. BUILD THE INDEX AND SPLIT THE INPUT IN ONE AWK PASS
echo "📋 Creando índice con awk..."
start_time=$(date +%s)

# Remove part files left behind by an earlier run: the compression step counts
# every temp_part_*.csv it finds, so stale files would be packaged again.
rm -f -- temp_part_*.csv

# The awk program below:
#   * takes the header from line 1 of the input (passing it through -v would
#     make awk interpret backslash escapes inside it),
#   * copies every data line to temp_part_<n>.csv, starting a new part (with
#     the header repeated) after lines_per_file records,
#   * prints a JSON array on stdout with one object per record:
#     i = field 1, t = field 2, n = field 3, r = field 10, p = part number.
# NOTE(review): fields are split on every comma, so a quoted CSV field that
# contains a comma shifts the following columns, and records spanning several
# lines are not supported. Confirm this matches the input data.
if ! awk -v lines_per_file="$LINES_PER_FILE" '
# Remove a leading and a trailing double quote from a raw CSV field.
function unquote(s) {
    gsub(/^"|"$/, "", s);
    return s;
}

# Escape a string for use inside a JSON string literal.
function json_escape(s) {
    # The replacement reaches gsub as \\& which means: one literal backslash
    # followed by the matched text, so every backslash is doubled. A plain
    # "\\\\" replacement collapses to a single backslash in gawk and would
    # leave the text unchanged.
    gsub(/\\/, "\\\\&", s);
    gsub(/"/, "\\\"", s);
    gsub(/\n/, "\\n", s);
    gsub(/\r/, "\\r", s);
    gsub(/\t/, "\\t", s);
    gsub(/\f/, "\\f", s);
    gsub(/\b/, "\\b", s);
    return s;
}

BEGIN {
    FS = ",";
    print "[";
    first_record = 1;
    current_part = 0;
    records_in_current_part = 0;
    total_records = 0;
}

# Line 1 is the header: remember it and start the first part with it.
NR == 1 {
    header = $0;
    current_part = 1;
    part_file = sprintf("temp_part_%d.csv", current_part);
    print header > part_file;
    next;
}

{
    records_in_current_part++;
    total_records++;

    # Limit exceeded: close the finished part (keeps the number of open
    # files constant) and start the next one with its own header line.
    if (records_in_current_part > lines_per_file) {
        close(part_file);
        current_part++;
        records_in_current_part = 1;
        part_file = sprintf("temp_part_%d.csv", current_part);
        print header > part_file;
    }

    # Basic fields
    id = json_escape(unquote($1));
    tipo = json_escape(unquote($2));
    numero = json_escape(unquote($3));

    # Title is field 10; shorter rows get an empty title.
    titulo = "";
    if (NF >= 10) {
        titulo = json_escape(unquote($10));
        # Optional in JSON, kept so the index text matches earlier output.
        gsub(/\//, "\\/", titulo);
    }

    # Append to the JSON index; the separator goes before every record
    # except the first so the array never ends with a comma.
    if (first_record == 0) {
        printf ",\n";
    }
    printf "{\"i\":\"%s\",\"t\":\"%s\",\"n\":\"%s\",\"r\":\"%s\",\"p\":%d}",
           id, tipo, numero, titulo, current_part;
    first_record = 0;

    # Copy the unmodified line to the current part. The file stays open
    # between records, so this appends after the header.
    print $0 > part_file;

    # Progress report
    if (NR % 50000 == 0) {
        printf "📊 Procesadas %d líneas...\r", NR > "/dev/stderr";
    }
}

END {
    print "\n]";
    # total_records excludes the header line.
    printf "\n✅ Procesados %d registros en %d partes\n", total_records, current_part > "/dev/stderr";
}
' "$INPUT_FILE" > "$INDEX_FILE"; then
    echo "❌ Error al crear el índice con awk" >&2
    exit 1
fi

end_time=$(date +%s)
echo "⏱️  Índice creado en $((end_time - start_time)) segundos"

# 4. COMPRESS PART FILES (IN PARALLEL)
echo "📦 Comprimiendo archivos..."

# Count the temporary part files with a glob instead of parsing `ls` output;
# this also yields a plain integer (some wc implementations pad with spaces).
TEMP_COUNT=0
for part_path in temp_part_*.csv; do
    # An unmatched glob is left as the literal pattern, so only count
    # entries that actually exist.
    if [[ -e "$part_path" ]]; then
        TEMP_COUNT=$((TEMP_COUNT + 1))
    fi
done
echo "📁 Archivos temporales: $TEMP_COUNT"

# Compress one temporary part file into its final .csv.gz and delete the
# temporary file only after the archive has been written successfully.
# Globals:   OUTPUT_PREFIX (read) - prefix of the generated .csv.gz files
# Arguments: $1 - part number; the input file is temp_part_<n>.csv
# Outputs:   progress line on stdout, error message on stderr
# Returns:   0 on success or when the part file does not exist,
#            1 when compression fails (the temporary file is kept)
compress_part() {
    local part=$1
    local temp_file="temp_part_${part}.csv"
    local gz_file="${OUTPUT_PREFIX}${part}.csv.gz"

    # Nothing to do for a part number that has no temporary file.
    if [[ ! -f "$temp_file" ]]; then
        return 0
    fi

    echo "🗜️  Comprimiendo parte $part..."
    if gzip -c "$temp_file" > "$gz_file"; then
        rm -- "$temp_file"
    else
        # Keep the source data and drop the incomplete archive so a failed
        # run cannot be mistaken for a finished one.
        echo "❌ Error al comprimir $temp_file" >&2
        rm -f -- "$gz_file"
        return 1
    fi
}

# Export the function and the variable it reads so the child bash processes
# started by xargs can call compress_part.
export -f compress_part
export OUTPUT_PREFIX

# Compress in parallel when there are many parts, serially otherwise.
# Failures are collected instead of ignored so the script never reports
# success with missing archives.
compress_failed=0
if (( TEMP_COUNT > 5 )); then
    echo "⚡ Usando procesamiento paralelo..."
    # xargs exits non-zero when any of the invocations fails.
    seq 1 "$TEMP_COUNT" | xargs -P 4 -I {} bash -c 'compress_part "$@"' _ {} \
        || compress_failed=1
else
    # Serial when there are only a few parts
    for ((i = 1; i <= TEMP_COUNT; i++)); do
        compress_part "$i" || compress_failed=1
    done
fi

if (( compress_failed )); then
    echo "❌ Error: no se pudieron comprimir todas las partes" >&2
    exit 1
fi

# 5. CREATE MINIFIED INDEX

# Write a compact (single-line) copy of a JSON file.
# jq is tried first, then python3. If neither is available or the source is
# not valid JSON, the source is copied verbatim so the destination always
# exists and is never left truncated.
# Arguments: $1 - source JSON file
#            $2 - destination file
# Outputs:   status message on stdout, warnings and errors on stderr
# Returns:   0 when the destination was written, non-zero otherwise
minify_index() {
    local src=$1
    local dst=$2

    if [[ ! -f "$src" ]]; then
        echo "❌ No existe el índice: $src" >&2
        return 1
    fi

    if command -v jq >/dev/null 2>&1 \
        && jq -c '.' "$src" > "$dst" 2>/dev/null; then
        echo "✅ Índice minificado creado con jq"
        return 0
    fi

    # File names are passed as arguments instead of being interpolated into
    # the Python source, so quotes in a path cannot break the program.
    # ensure_ascii=False keeps non-ASCII text as UTF-8, matching `jq -c`.
    # The source is fully parsed before the destination is opened, so a
    # parse error does not leave a partial destination behind.
    if command -v python3 >/dev/null 2>&1 && python3 -c '
import json, sys
with open(sys.argv[1], "r", encoding="utf-8") as f:
    data = json.load(f)
with open(sys.argv[2], "w", encoding="utf-8") as f:
    json.dump(data, f, separators=(",", ":"), ensure_ascii=False)
' "$src" "$dst" 2>/dev/null; then
        echo "✅ Índice minificado creado con Python"
        return 0
    fi

    echo "⚠️  No se pudo minificar el índice; se copia sin cambios" >&2
    cp -- "$src" "$dst"
}

echo "📄 Creando índice minificado..."
minify_index "$INDEX_FILE" "${INDEX_FILE}.min.json"

# 6. COMPRESS INDEX
echo "🗜️  Comprimiendo índice..."
# -k keeps the uncompressed index, -f overwrites an archive from an earlier run.
if ! gzip -k -f "$INDEX_FILE"; then
    echo "❌ Error al comprimir el índice: $INDEX_FILE" >&2
    exit 1
fi

# 7. VERIFY
echo "🔍 Verificando resultados..."
echo ""
echo "========================================="
echo "📊 RESUMEN FINAL"
echo "========================================="

# List the generated data files as "  name (size in bytes)". The glob is
# iterated directly instead of parsing `ls -la` columns, which breaks on
# unusual file names; the arithmetic expansion strips wc padding.
echo "📁 ARCHIVOS DE DATOS:"
for data_file in "${OUTPUT_PREFIX}"*.csv.gz; do
    # An unmatched glob stays literal; skip it.
    [[ -f "$data_file" ]] || continue
    printf '  %s (%s)\n' "$data_file" "$(( $(wc -c < "$data_file") ))"
done | sort -V

echo ""
echo "📄 ARCHIVOS DE ÍNDICE:"
for file in "$INDEX_FILE" "$INDEX_FILE.gz" "${INDEX_FILE}.min.json"; do
    if [[ -f "$file" ]]; then
        size=$(du -h "$file" | cut -f1)
        echo "  - $file ($size)"
    fi
done

# Measure up to this point so the figure covers compression and minification
# as well, not only the awk pass. Falls back to 0 if start_time is not set.
now_time=$(date +%s)
total_seconds=$(( now_time - ${start_time:-now_time} ))

echo ""
echo "📈 ESTADÍSTICAS:"
echo "  - Registros totales: $TOTAL_RECORDS"
echo "  - Partes creadas: $TEMP_COUNT"
echo "  - Tiempo total: ${total_seconds} segundos"

# Total size of the data files; du prints the grand total on its last line.
# Errors are silenced on purpose: with no data files the size is shown as 0.
DATA_SIZE=$(du -ch "${OUTPUT_PREFIX}"*.csv.gz 2>/dev/null | tail -n 1 | cut -f1)
echo "  - Tamaño total datos: ${DATA_SIZE:-0}"

echo "========================================="
echo "🎉 ¡PROCESO COMPLETADO!"
echo "========================================="