git_cleaner.py: Recursively run git gc on all your repositories

This script, git_cleaner.py, recursively searches a specified directory (or the current directory by default) for Git repositories and runs git gc on each one. It measures the size and file count of each repository's .git directory before and after running git gc, so you can see how much space was reclaimed and how many files were removed. Any extra command-line arguments the script does not recognize (for example --aggressive) are passed through to git gc. The script also prints a final summary of all repositories processed.

git_cleaner.py
#!/usr/bin/env python3
import os
import subprocess
import argparse
import sys

def get_dir_stats(path):
    """Return ``(total_size_bytes, total_file_count)`` for the tree under *path*.

    The tree is traversed with ``os.walk``. Symbolic links are neither
    counted nor sized. Entries that cannot be inspected (for example a file
    that disappears between the directory listing and the size lookup) are
    skipped instead of aborting the whole scan.

    Args:
        path: Directory to measure. A path that does not exist yields
            ``(0, 0)`` because ``os.walk`` produces nothing for it.

    Returns:
        A tuple ``(total_size, file_count)`` where ``total_size`` is the sum
        of ``os.path.getsize`` over all counted files.
    """
    total_size = 0
    file_count = 0
    for dirpath, _, filenames in os.walk(path):
        for name in filenames:
            fp = os.path.join(dirpath, name)
            if os.path.islink(fp):
                continue
            try:
                size = os.path.getsize(fp)
            except OSError:
                # The file vanished or became inaccessible after it was
                # listed; leave it out of both totals rather than crash.
                continue
            total_size += size
            file_count += 1
    return total_size, file_count

def format_size(bytes_size):
    """Render a byte count as a two-decimal string with a binary unit suffix.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    units B, KB, MB and GB are exhausted; anything larger is reported in TB.
    Negative values keep their sign.
    """
    value = bytes_size
    for suffix in ("B", "KB", "MB", "GB"):
        fits_in_unit = -1024 < value < 1024
        if fits_in_unit:
            return "{:.2f} {}".format(value, suffix)
        value = value / 1024
    return "{:.2f} TB".format(value)

def main():
    """Find Git repositories under a directory and run ``git gc`` in each.

    Command line:
        target_dir   Optional directory to search (default: ``.``).
        Any other arguments are not interpreted here; they are appended
        verbatim to the ``git gc`` command line.

    For every directory that contains a ``.git`` subdirectory, the size and
    file count of that ``.git`` directory are measured before and after
    ``git gc`` and the difference is printed. A summary of all repositories
    is printed at the end.

    Exits with status 1 if the target is not a directory or if the ``git``
    executable cannot be started. A failing ``git gc`` in one repository is
    reported and the scan continues with the next one.
    """
    parser = argparse.ArgumentParser(
        description="Recursively run 'git gc', reporting space and files saved."
    )
    parser.add_argument(
        "target_dir",
        nargs="?",
        default=".",
        help="Directory to search (default: '.')"
    )

    # parse_known_args keeps unrecognised options so they can be forwarded
    # to git gc below.
    args, unknown_args = parser.parse_known_args()
    target_path = os.path.abspath(args.target_dir)

    if not os.path.isdir(target_path):
        print(f"Error: {target_path} is not a directory.", file=sys.stderr)
        sys.exit(1)

    total_saved_bytes = 0
    total_files_removed = 0
    repos_processed = 0

    print(f"--- Scanning: {target_path} ---")

    for root, dirs, _ in os.walk(target_path):
        if ".git" not in dirs:
            continue

        # Prune .git from the walk up front so os.walk never descends into
        # it, regardless of how processing of this repository ends.
        dirs.remove(".git")

        repos_processed += 1
        git_dir = os.path.join(root, ".git")

        # 1. Capture stats before
        size_before, files_before = get_dir_stats(git_dir)

        print(f"\n[{repos_processed}] Optimizing: {root}")

        # 2. Run git gc; -C makes git operate in the repository directory.
        cmd = ["git", "-C", root, "gc"] + unknown_args
        try:
            subprocess.run(cmd, check=True, capture_output=True)
        except FileNotFoundError:
            # No cwd is passed, so this can only mean the git executable
            # itself is missing; every other repository would fail too.
            print("Error: 'git' executable not found on PATH.", file=sys.stderr)
            sys.exit(1)
        except subprocess.CalledProcessError as e:
            # git's stderr is not guaranteed to be valid UTF-8; never let
            # reporting an error raise a second one.
            message = (e.stderr or b"").decode(errors="replace").strip()
            print(f"    [!] Error: {message}")
            continue

        # 3. Capture stats after
        size_after, files_after = get_dir_stats(git_dir)

        saved_size = size_before - size_after
        removed_files = files_before - files_after

        # A repository that grew does not reduce the overall totals.
        total_saved_bytes += max(0, saved_size)
        total_files_removed += max(0, removed_files)

        print(f"    Files: {files_before} -> {files_after} ({removed_files} removed)")
        print(f"    Size:  {format_size(size_before)} -> {format_size(size_after)} ({format_size(saved_size)} saved)")

    # Final Summary Table
    print("\n" + "="*45)
    print(f"{'FINAL SUMMARY':^45}")
    print("-" * 45)
    print(f" Repositories Processed : {repos_processed}")
    print(f" Total Space Reclaimed  : {format_size(total_saved_bytes)}")
    print(f" Total Files Removed    : {total_files_removed}")
    print("="*45)

# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Example output

example.txt
[...]

[433] Optimizing: /home/uli/dev/FlareDNS
    Files: 73 -> 33 (40 removed)
    Size:  70.14 KB -> 47.03 KB (23.10 KB saved)

=============================================
                FINAL SUMMARY
---------------------------------------------
 Repositories Processed : 433
 Total Space Reclaimed  : 238.14 MB
 Total Files Removed    : 21612
=============================================

Check out similar posts by category: Git, Python