import json
import time
from pathlib import Path

from slugify import slugify


def process_json_file(input_file, output_file=None, batch_size=10000):
    """Add a slug to each record of a JSON fixture (model/pk/fields objects)."""
    if output_file is None:
        output_file = f"{Path(input_file).stem}_with_slugs.json"

    start_time = time.time()

    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    total = len(data)
    processed = 0

    print(f"Processing {total} records...")

    for item in data:
        # Generate the slug from the record's name field
        name = item["fields"].get("name", "")
        pk = item["pk"]

        if name:
            slug = slugify(name)[:50]  # truncate the base slug to 50 chars
            # Append the pk to guarantee uniqueness; note the suffix can push
            # the final slug past 50 chars
            item["fields"]["slug"] = f"{slug}-{pk}"
        else:
            # Fall back to model-pk when the name is empty
            model_name = item["model"].split(".")[-1]
            item["fields"]["slug"] = f"{model_name}-{pk}"

        processed += 1
        if processed % batch_size == 0:
            print(f"Processed {processed}/{total} records...")

    # Save the modified data
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"Completed in {time.time() - start_time:.2f} seconds")
    print(f"Output saved to {output_file}")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("input_file", help="Path to input JSON file")
    parser.add_argument("-o", "--output", help="Output file path")
    parser.add_argument(
        "-b", "--batch", type=int, default=10000, help="Progress reporting batch size"
    )
    args = parser.parse_args()

    process_json_file(args.input_file, args.output, args.batch)
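
# Example invocation (the script filename is illustrative):
#   python add_slugs.py products.json -o products_with_slugs.json -b 5000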