Python-Skript zum rekursiven Finden & Zählen von Wort-Token in Dateinamen

Dieses Skript zeigt Ihnen, welche Wörter (Token) in den Dateinamen eines beliebigen Verzeichnisbaums vorkommen und wie oft jedes Wort auftritt. Dazu wird von jedem Dateinamen die Erweiterung entfernt und der Rest an Leerzeichen, Bindestrichen und Unterstrichen in Token zerlegt. Das Skript verwendet die Module os, re, collections und argparse, um den Verzeichnisbaum zu durchlaufen, die Dateinamen zu zerlegen und die Wörter zu zählen.

filename_token_counter.py
#!/usr/bin/env python3
import os
import re
from collections import Counter
import argparse

def list_files_recursively(directory):
    """Yield the path of every file found beneath *directory*.

    The tree is traversed with ``os.walk``; each yielded path is the
    walked folder joined with the file name, so it carries the same
    prefix as the *directory* argument that was passed in.
    """
    for parent, _subdirs, names in os.walk(directory):
        yield from (os.path.join(parent, name) for name in names)

def process_filename(filename, lower_tokens=False, ignore_numeric=False):
    """Split a file's base name (minus its last extension) into word tokens.

    Args:
        filename: A file name or path; only the final path component is used.
        lower_tokens: When true, every token is lower-cased.
        ignore_numeric: When true, tokens for which ``str.isnumeric()``
            holds are dropped.

    Returns:
        A list of non-empty tokens in the order they appear in the name.
    """
    stem, _ext = os.path.splitext(os.path.basename(filename))

    # Runs of whitespace, dashes and underscores act as separators;
    # filter(None, ...) discards the empty strings produced at the edges.
    pieces = filter(None, re.split(r'[\s\-_]+', stem))

    if lower_tokens:
        pieces = map(str.lower, pieces)

    if ignore_numeric:
        pieces = (piece for piece in pieces if not piece.isnumeric())

    return list(pieces)

def build_token_counter(directory, lower_tokens=False, ignore_numeric=False):
    """Count how often each filename token occurs beneath *directory*.

    Every file yielded by ``list_files_recursively`` is tokenized with
    ``process_filename`` (the two flags are passed straight through) and
    the resulting tokens are tallied.

    Returns:
        A ``collections.Counter`` mapping token -> number of occurrences.
    """
    return Counter(
        tok
        for path in list_files_recursively(directory)
        for tok in process_filename(path, lower_tokens, ignore_numeric)
    )

def main():
    """Command-line entry point: print token counts for a directory tree.

    Parses the positional ``directory`` argument plus the ``-s``
    (case-sensitive) and ``-N`` (ignore all-numeric tokens) switches,
    builds the token counter and prints one ``token: count`` line per
    token, ordered by ascending count and then by token.

    Exits through ``parser.error`` (usage message on stderr, exit status 2)
    when ``directory`` is not an existing directory.
    """
    # Parse arguments
    parser = argparse.ArgumentParser(description="Tokenize filenames and count token occurrences.")
    parser.add_argument("directory", help="The directory to scan.")
    parser.add_argument("-s", action="store_true", help="Case-sensitive tokenization (default is case-insensitive).")
    parser.add_argument("-N", action="store_true", help="Ignore all-numeric tokens.")
    args = parser.parse_args()

    # os.walk() ignores errors by default, so a mistyped or missing path
    # would otherwise produce an empty listing that looks like a valid result.
    if not os.path.isdir(args.directory):
        parser.error(f"not a directory: {args.directory}")

    # Build token counter
    token_counter = build_token_counter(args.directory, lower_tokens=not args.s, ignore_numeric=args.N)

    # Print tokens sorted with the most common at the bottom;
    # ties on count are ordered alphabetically by token.
    print("\nToken counts (most common at the bottom):")
    for token, count in sorted(token_counter.items(), key=lambda item: (item[1], item[0])):
        print(f"{token}: {count}")

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Check out similar posts by category: Python