Back to Deck
Terminal window
[utility]
SQL to Directus Seed
SOURCE Python
VERSION 1.0
AUTHOR Cododel
A robust Python utility that parses INSERT INTO statements from SQL dumps (PostgreSQL, MySQL) and converts them into Directus Seed JSON format for directus-sync. Handles complex types, nulls, casts, and automatically generates sync IDs.
Features
- Smart SQL Parsing: Handles comments, multiline statements, quoted identifiers
- Type Detection: Automatically converts strings, numbers, nulls, UUIDs, timestamps
- PostgreSQL Casts: Handles `::uuid` and `::timestamp` type casts
- Sync ID Generation: Auto-generates `_sync_id` based on the primary key
- Multi-table Support: Processes multiple tables with dependency ordering
- Error Handling: Validates column/value count, logs parsing warnings
Use Case
Perfect for migrating data between environments or converting legacy SQL dumps to Directus-compatible seeds:
# Convert PostgreSQL dump to Directus seed
python sql_to_seed.py -i production_dump.sql -o seeds/production.json
# Convert multiple tables
python sql_to_seed.py -i multi_table_dump.sql -o seeds/all_tables.json

Input Example
dump.sql:
INSERT INTO products (id, name, price, created_at)
VALUES (1, 'Product A', 99.99, '2024-01-01'::timestamp), (2, 'Product B', 149.50, '2024-01-02'::timestamp);

INSERT INTO categories (id, name, slug)
VALUES ('uuid-123'::uuid, 'Electronics', 'electronics'), ('uuid-456'::uuid, 'Books', 'books');

Output Example
seed.json:
[
  {
    "collection": "products",
    "meta": { "insert_order": 1, "create": true, "update": true, "delete": true, "preserve_ids": true },
    "data": [
      { "id": 1, "name": "Product A", "price": 99.99, "created_at": "2024-01-01", "_sync_id": "products-1" },
      { "id": 2, "name": "Product B", "price": 149.5, "created_at": "2024-01-02", "_sync_id": "products-2" }
    ]
  },
  {
    "collection": "categories",
    "meta": { "insert_order": 1, "create": true, "update": true, "delete": true, "preserve_ids": true },
    "data": [
      { "id": "uuid-123", "name": "Electronics", "slug": "electronics", "_sync_id": "categories-uuid-123" },
      { "id": "uuid-456", "name": "Books", "slug": "books", "_sync_id": "categories-uuid-456" }
    ]
  }
]

How It Works
- Normalization: Removes comments, standardizes whitespace
- Statement Splitting: Separates multiple INSERT statements
- Parsing: Extracts table name, columns, and values
- Type Conversion: Converts SQL literals to JSON types
- Seed Generation: Creates Directus-compatible JSON structure
- Dependency Ordering: Smart ordering for tables with relationships
Source Code
sql_to_seed.py (Full implementation - 260+ lines)
import reimport jsonimport argparsefrom typing import List, Dict, Any, Tuple, Optionalfrom pathlib import Path
def normalize_sql_content(sql_content: str) -> List[str]:
    """Normalize SQL and split it into individual statements.

    Removes block (``/* */``) and line (``--``) comments, collapses
    whitespace, then splits on semicolons that appear outside
    single-quoted string literals.

    Args:
        sql_content: Raw SQL dump text.

    Returns:
        List of normalized statement strings (trailing ``;`` kept).
        BUG FIX: the original annotation claimed ``-> str`` although a
        list has always been returned.

    NOTE(review): the whitespace-collapsing regexes below are not
    quote-aware, so spacing around commas/parens inside string literals
    (e.g. ``'A, B'``) is also normalized — lossy for such data.
    """
    # Remove comments (block comments first, then line comments).
    sql_content = re.sub(r'/\*.*?\*/', '', sql_content, flags=re.DOTALL)
    sql_content = re.sub(r'--.*$', '', sql_content, flags=re.MULTILINE)

    # Collapse lines and standardize whitespace around punctuation.
    lines = [line.strip() for line in sql_content.splitlines() if line.strip()]
    sql_content = ' '.join(lines)
    sql_content = re.sub(r'\s*,\s*', ',', sql_content)
    sql_content = re.sub(r'\s*\(\s*', '(', sql_content)
    sql_content = re.sub(r'\s*\)\s*', ')', sql_content)
    sql_content = re.sub(r'\s+', ' ', sql_content).strip()

    # Split by semicolon, respecting single-quoted strings.
    statements: List[str] = []
    current_statement: List[str] = []
    in_quotes = False
    for char in sql_content:
        current_statement.append(char)
        if char == "'":
            in_quotes = not in_quotes
        elif char == ';' and not in_quotes:
            statements.append("".join(current_statement).strip())
            current_statement = []

    if current_statement:
        statements.append("".join(current_statement).strip())

    return statements
def parse_value(value_str: str) -> Any:
    """Parse a single SQL literal into a Python/JSON value.

    Handles, in order: quoted strings (with an optional PostgreSQL
    ``::type`` cast), NULL, TRUE/FALSE, integers, floats. Anything
    unrecognized is returned unchanged as raw text.

    Args:
        value_str: One SQL literal, e.g. ``'abc'::uuid`` or ``42``.

    Returns:
        str, None, bool, int, float — or the raw text as a fallback.
    """
    value_str = value_str.strip()

    # Quoted string with optional PostgreSQL cast: 'value'::type
    cast_match = re.match(r"^'(.*?)'(?:::[\w\s]+)?$", value_str)
    if cast_match:
        # BUG FIX: un-escape SQL doubled single quotes ('' -> ').
        return cast_match.group(1).replace("''", "'")

    # Simple quoted string (kept for parity with odd inputs the cast
    # regex might reject).
    if value_str.startswith("'") and value_str.endswith("'"):
        return value_str[1:-1].replace("''", "'")

    # NULL -> JSON null
    if value_str.lower() == 'null':
        return None

    # BUG FIX: boolean literals (common in PostgreSQL dumps) previously
    # fell through and were emitted as the strings "true"/"false".
    if value_str.lower() == 'true':
        return True
    if value_str.lower() == 'false':
        return False

    # Numbers: int first, then float; otherwise return the raw token.
    try:
        return int(value_str)
    except ValueError:
        try:
            return float(value_str)
        except ValueError:
            return value_str
def parse_insert_statement(statement: str) -> Optional[Tuple[str, List[str], List[List[Any]]]]:
    """Parse one ``INSERT INTO`` statement.

    Args:
        statement: A single normalized SQL statement.

    Returns:
        ``(table_name, columns, rows)`` on success, or ``None`` when the
        statement is not an INSERT or yields no usable rows.

    NOTE(review): value tuples are located with a non-greedy
    parenthesis regex, so parenthesized expressions inside values
    (e.g. function calls) will break the split — see Limitations.
    """
    if not statement.lower().startswith('insert into'):
        return None

    # Extract the (optionally double-quoted) table name.
    table_match = re.search(r'insert\s+into\s+("?)([\w\.]+)\1', statement, re.IGNORECASE)
    if not table_match:
        return None
    table_name = table_match.group(2)

    # Extract the column list between the parens preceding VALUES.
    columns_match = re.search(r'\((.*?)\)\s*values', statement, re.IGNORECASE | re.DOTALL)
    if not columns_match:
        return None
    columns = [col.strip().strip('"') for col in columns_match.group(1).split(',')]

    # Extract each "(...)" value tuple after VALUES.
    values_part = statement[columns_match.end():].strip()
    value_tuples_str = re.findall(r'\((.*?)\)', values_part)

    parsed_values = []
    for tuple_str in value_tuples_str:
        values_in_tuple = []
        current_val = []
        in_quotes = False
        # Split on commas that are outside single-quoted strings.
        for char in tuple_str:
            if char == "'":
                in_quotes = not in_quotes
                current_val.append(char)
            elif char == ',' and not in_quotes:
                values_in_tuple.append("".join(current_val).strip())
                current_val = []
            else:
                current_val.append(char)
        values_in_tuple.append("".join(current_val).strip())

        # Skip malformed rows whose arity doesn't match the column list.
        if len(values_in_tuple) == len(columns):
            parsed_values.append([parse_value(v) for v in values_in_tuple])

    # BUG FIX: the original `return a, b, c if cond else None` bound the
    # conditional only to the last element (returning (a, b, None) on
    # failure, which later crashed len(values) in the caller).
    # Parenthesize so the whole tuple — or None — is returned.
    return (table_name, columns, parsed_values) if columns and parsed_values else None
def create_seed_format(table_name: str, columns: List[str], values_list: List[List[Any]]) -> Dict[str, Any]:
    """Create the Directus seed structure for one collection.

    Args:
        table_name: Target collection name.
        columns: Column names, in the same order as each row's values.
        values_list: Parsed rows (one list of values per row).

    Returns:
        Dict with ``collection``, ``meta`` and ``data`` keys; every data
        item carries a generated ``_sync_id``.
    """
    # Prefer 'id', then 'uuid', else fall back to the first column as PK.
    pk_column = 'id' if 'id' in columns else ('uuid' if 'uuid' in columns else columns[0])
    pk_index = columns.index(pk_column)

    items = []
    for row_values in values_list:
        item = dict(zip(columns, row_values))
        pk_value = row_values[pk_index]
        # BUG FIX: compare against None rather than truthiness, so valid
        # falsy PKs (0, empty string) still get a stable "<table>-<pk>"
        # sync id instead of the positional fallback.
        if pk_value is not None:
            item['_sync_id'] = f"{table_name}-{pk_value}"
        else:
            item['_sync_id'] = f"{table_name}-item-{len(items)}"
        items.append(item)

    return {
        "collection": table_name,
        "meta": {
            "insert_order": 1,
            "create": True,
            "update": True,
            "delete": True,
            "preserve_ids": True,
            "ignore_on_update": []
        },
        "data": items
    }
def process_sql_file(input_file: Path, output_file: Path) -> None:
    """Convert every INSERT in *input_file* into seed JSON at *output_file*."""
    sql_content = input_file.read_text(encoding='utf-8')

    all_seeds = []
    for statement in normalize_sql_content(sql_content):
        parsed = parse_insert_statement(statement)
        if parsed is None:
            continue
        table_name, columns, values = parsed
        print(f"Processing {table_name}: {len(values)} rows")
        all_seeds.append(create_seed_format(table_name, columns, values))

    if not all_seeds:
        print("Error: No valid INSERT statements found")
        return

    # Heuristic ordering: translation tables reference their parents,
    # so push them into a later insert pass.
    if len(all_seeds) > 1:
        for seed in all_seeds:
            if 'translations' in seed['collection'].lower():
                seed['meta']['insert_order'] = 2

    # A single table is emitted as a bare object, multiple as an array.
    output_data = all_seeds[0] if len(all_seeds) == 1 else all_seeds

    output_file.write_text(
        json.dumps(output_data, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )
    print(f"✓ Converted {input_file} → {output_file}")
def main() -> None:
    """CLI entry point: parse the arguments and run the conversion."""
    arg_parser = argparse.ArgumentParser(
        description='Convert SQL INSERT to Directus seed')
    arg_parser.add_argument('-i', '--input', type=Path, required=True,
                            help='Input SQL file')
    arg_parser.add_argument('-o', '--output', type=Path, required=True,
                            help='Output JSON file')
    args = arg_parser.parse_args()

    if not args.input.exists():
        print(f"Error: {args.input} not found")
        return

    # Make sure the output directory exists before writing.
    args.output.parent.mkdir(parents=True, exist_ok=True)
    process_sql_file(args.input, args.output)
if __name__ == "__main__":
    main()

Limitations
- Basic regex parsing (not a full SQL parser)
- May struggle with heavily nested parentheses or complex escaped quotes
- Dependency ordering is heuristic (manual adjustment may be needed)
- Tested primarily with PostgreSQL dumps