refactor: merge categorization scripts
Merge the logic from categorize_workflows.py into create_categories.py to simplify the categorization process. The categorize_workflows.py script is now deleted.
This commit is contained in:
@@ -1,216 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Script to categorize uncategorized n8n workflows based on filename patterns.
|
|
||||||
This will help reduce the count of uncategorized workflows.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
def load_categories():
    """Read and return the workflow entries from context/search_categories.json."""
    path = 'context/search_categories.json'
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
|
|
||||||
|
|
||||||
def load_unique_categories():
    """Read and return the unique-categories list from context/unique_categories.json."""
    path = 'context/unique_categories.json'
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
|
|
||||||
|
|
||||||
def categorize_by_filename(filename):
    """
    Suggest a category for an n8n workflow based on keywords in its filename.

    Matching is case-insensitive substring search; the FIRST matching rule
    wins, so rule order matters (e.g. a filename containing both 'auth' and
    'slack' is classified as infrastructure, not communication).

    Returns the most likely category name, or None if uncertain.
    """
    # Ordered (keywords, category) rules; earlier entries take precedence.
    rules = (
        # Security & Authentication
        (('totp', 'bitwarden', 'auth', 'security'),
         "Technical Infrastructure & DevOps"),
        # Data Processing & File Operations
        (('process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile', 'converttofile'),
         "Data Processing & Analysis"),
        # Utility & Business Process Automation
        (('noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit', 'aggregate'),
         "Business Process Automation"),
        # Webhook & API related
        (('webhook', 'respondtowebhook', 'http'),
         "Web Scraping & Data Extraction"),
        # Form & Data Collection
        (('form', 'typeform', 'jotform'),
         "Data Processing & Analysis"),
        # Local file operations
        (('localfile', 'filemaker'),
         "Cloud Storage & File Management"),
        # Database operations
        (('postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake'),
         "Data Processing & Analysis"),
        # AI & Machine Learning
        (('openai', 'awstextract', 'awsrekognition', 'humanticai', 'openthesaurus'),
         "AI Agent Development"),
        # E-commerce specific
        (('woocommerce', 'gumroad'),
         "E-commerce & Retail"),
        # Social media specific
        (('facebook', 'linkedin', 'instagram'),
         "Social Media Management"),
        # Customer support
        (('zendesk', 'intercom', 'drift', 'pagerduty'),
         "Communication & Messaging"),
        # Analytics & Tracking
        (('googleanalytics', 'segment', 'mixpanel'),
         "Data Processing & Analysis"),
        # Development tools
        (('git', 'github', 'gitlab', 'travisci', 'jenkins'),
         "Technical Infrastructure & DevOps"),
        # CRM & Sales tools
        (('pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit'),
         "CRM & Sales"),
        # Marketing tools
        (('mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist'),
         "Marketing & Advertising Automation"),
        # Project management
        (('asana', 'mondaycom', 'clickup', 'trello', 'notion'),
         "Project Management"),
        # Communication
        (('slack', 'telegram', 'discord', 'mattermost', 'twilio'),
         "Communication & Messaging"),
        # Cloud storage
        (('dropbox', 'googledrive', 'onedrive', 'awss3'),
         "Cloud Storage & File Management"),
        # Creative tools
        (('canva', 'figma', 'bannerbear', 'editimage'),
         "Creative Design Automation"),
        # Video & content
        (('youtube', 'vimeo', 'storyblok', 'strapi'),
         "Creative Content & Video Automation"),
        # Financial tools
        (('stripe', 'chargebee', 'quickbooks', 'harvest'),
         "Financial & Accounting"),
        # Weather & external APIs
        (('openweathermap', 'nasa', 'crypto', 'coingecko'),
         "Web Scraping & Data Extraction"),
    )

    lowered = filename.lower()
    for keywords, category in rules:
        if any(word in lowered for word in keywords):
            return category
    return None
|
|
||||||
|
|
||||||
def _category_distribution(workflows):
    """Tally workflows per category.

    Returns a (counts, uncategorized) pair where counts maps category name
    to workflow count and uncategorized is the number of entries whose
    'category' field is empty/falsy.
    """
    counts = defaultdict(int)
    uncategorized = 0
    for workflow in workflows:
        if workflow['category']:
            counts[workflow['category']] += 1
        else:
            uncategorized += 1
    return counts, uncategorized


def _print_distribution(counts, uncategorized, heading):
    """Print a category distribution (sorted by name) under the given heading."""
    print(heading)
    for category, count in sorted(counts.items()):
        print(f"  {category}: {count}")
    print(f"  Uncategorized: {uncategorized}")


def main():
    """Main function to categorize workflows.

    Loads context/search_categories.json, suggests a category for every
    uncategorized workflow via categorize_by_filename(), reports the
    potential improvement, and — after interactive confirmation — writes
    the applied categorizations back to the same file.
    """
    print("Loading workflow categories...")
    workflows = load_categories()
    # NOTE(review): unique_categories is loaded but never used below —
    # kept for parity with the original behavior; confirm before removing.
    unique_categories = load_unique_categories()

    print(f"Total workflows: {len(workflows)}")

    category_counts, uncategorized_count = _category_distribution(workflows)
    _print_distribution(category_counts, uncategorized_count,
                        "\nCurrent category distribution:")

    # Workflows still lacking a category.
    uncategorized_workflows = [w for w in workflows if not w['category']]
    print(f"\nAnalyzing {len(uncategorized_workflows)} uncategorized workflows...")

    # Suggest categories from filename patterns.
    suggested_categories = {}
    uncertain_workflows = []
    for workflow in uncategorized_workflows:
        filename = workflow['filename']
        suggested_category = categorize_by_filename(filename)
        if suggested_category:
            suggested_categories[filename] = suggested_category
        else:
            uncertain_workflows.append(filename)

    print(f"\nSuggested categorizations: {len(suggested_categories)}")
    # BUG FIX: previously reported len(uncategorized_workflows) — the
    # pre-analysis total — instead of the number still uncertain.
    print(f"Still uncertain: {len(uncertain_workflows)}")

    # Show suggested categorizations.
    if suggested_categories:
        print("\nSuggested categorizations:")
        for filename, category in sorted(suggested_categories.items()):
            # BUG FIX: the loop variable was unused and a placeholder was
            # printed instead of the actual filename.
            print(f"  {filename} → {category}")

    # Show workflows that still need a human decision.
    if uncertain_workflows:
        print(f"\nWorkflows that need manual review:")
        for filename in sorted(uncertain_workflows):
            # BUG FIX: same unused-loop-variable issue as above.
            print(f"  {filename}")

    # Calculate potential improvement.
    potential_categorized = len(suggested_categories)
    new_uncategorized_count = uncategorized_count - potential_categorized
    print(f"\nPotential improvement:")
    print(f"  Current uncategorized: {uncategorized_count}")
    print(f"  After auto-categorization: {new_uncategorized_count}")
    if uncategorized_count:
        # BUG FIX: guard against ZeroDivisionError when every workflow is
        # already categorized.
        reduction_pct = potential_categorized / uncategorized_count * 100
        print(f"  Reduction: {potential_categorized} workflows ({reduction_pct:.1f}%)")

    # Ask for confirmation before mutating the JSON file.
    if suggested_categories:
        response = input(f"\nWould you like to apply these {len(suggested_categories)} suggested categorizations? (y/n): ")

        if response.lower() in ['y', 'yes']:
            # Apply the categorizations in place.
            for workflow in workflows:
                if workflow['filename'] in suggested_categories:
                    workflow['category'] = suggested_categories[workflow['filename']]

            # Save the updated file.
            with open('context/search_categories.json', 'w', encoding='utf-8') as f:
                json.dump(workflows, f, indent=2, ensure_ascii=False)

            print("✅ Categorizations applied and saved!")

            new_counts, new_uncat = _category_distribution(workflows)
            _print_distribution(new_counts, new_uncat,
                                "\nNew category distribution:")
        else:
            print("No changes applied.")
|
|
||||||
|
|
||||||
# Entry point: run the categorizer only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -47,6 +47,103 @@ def find_matching_category(tokens, integration_to_category):
|
|||||||
|
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
def categorize_by_filename(filename):
    """
    Categorize a workflow based on case-insensitive keyword matches in its
    filename. The FIRST matching rule wins, so rule order is significant.

    Returns the most likely category name, or "" if uncertain.
    """
    # Ordered (keywords, category) rules; earlier entries take precedence.
    rules = (
        # Security & Authentication
        (('totp', 'bitwarden', 'auth', 'security'),
         "Technical Infrastructure & DevOps"),
        # Data Processing & File Operations
        (('process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile',
          'converttofile', 'googlefirebasecloudfirestore', 'supabase',
          'surveymonkey', 'renamekeys', 'readpdf', 'wufoo', 'splitinbatches',
          'airtop', 'comparedatasets', 'spreadsheetfile'),
         "Data Processing & Analysis"),
        # Utility & Business Process Automation
        (('noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit',
          'aggregate', 'acuityscheduling', 'eventbrite', 'philipshue',
          'stickynote', 'n8ntrainingcustomerdatastore', 'n8n'),
         "Business Process Automation"),
        # Webhook & API related
        (('webhook', 'respondtowebhook', 'http', 'rssfeedread'),
         "Web Scraping & Data Extraction"),
        # Form & Data Collection
        (('form', 'typeform', 'jotform'),
         "Data Processing & Analysis"),
        # Local file operations
        (('localfile', 'filemaker'),
         "Cloud Storage & File Management"),
        # Database operations
        (('postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake'),
         "Data Processing & Analysis"),
        # AI & Machine Learning
        (('openai', 'awstextract', 'awsrekognition', 'humanticai',
          'openthesaurus', 'googletranslate', 'summarize'),
         "AI Agent Development"),
        # E-commerce specific
        (('woocommerce', 'gumroad'),
         "E-commerce & Retail"),
        # Social media specific
        (('facebook', 'linkedin', 'instagram'),
         "Social Media Management"),
        # Customer support
        (('zendesk', 'intercom', 'drift', 'pagerduty'),
         "Communication & Messaging"),
        # Analytics & Tracking
        (('googleanalytics', 'segment', 'mixpanel'),
         "Data Processing & Analysis"),
        # Development tools
        (('git', 'github', 'gitlab', 'travisci', 'jenkins', 'uptimerobot',
          'gsuiteadmin', 'debughelper', 'bitbucket'),
         "Technical Infrastructure & DevOps"),
        # CRM & Sales tools
        (('pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit', 'agilecrm'),
         "CRM & Sales"),
        # Marketing tools
        (('mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist',
          'sendy', 'postmark', 'mailgun'),
         "Marketing & Advertising Automation"),
        # Project management
        (('asana', 'mondaycom', 'clickup', 'trello', 'notion', 'toggl',
          'microsofttodo', 'calendly', 'jira'),
         "Project Management"),
        # Communication
        (('slack', 'telegram', 'discord', 'mattermost', 'twilio',
          'emailreadimap', 'teams', 'gotowebinar'),
         "Communication & Messaging"),
        # Cloud storage
        (('dropbox', 'googledrive', 'onedrive', 'awss3', 'googledocs'),
         "Cloud Storage & File Management"),
        # Creative tools
        (('canva', 'figma', 'bannerbear', 'editimage'),
         "Creative Design Automation"),
        # Video & content
        (('youtube', 'vimeo', 'storyblok', 'strapi'),
         "Creative Content & Video Automation"),
        # Financial tools
        (('stripe', 'chargebee', 'quickbooks', 'harvest'),
         "Financial & Accounting"),
        # Weather & external APIs
        (('openweathermap', 'nasa', 'crypto', 'coingecko'),
         "Web Scraping & Data Extraction"),
    )

    lowered = filename.lower()
    for keywords, category in rules:
        if any(word in lowered for word in keywords):
            return category
    return ""
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Load definition categories
|
# Load definition categories
|
||||||
integration_to_category = load_def_categories()
|
integration_to_category = load_def_categories()
|
||||||
@@ -72,6 +169,11 @@ def main():
|
|||||||
"category": category
|
"category": category
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Second pass for categorization
|
||||||
|
for item in search_categories:
|
||||||
|
if not item['category']:
|
||||||
|
item['category'] = categorize_by_filename(item['filename'])
|
||||||
|
|
||||||
# Sort by filename for consistency
|
# Sort by filename for consistency
|
||||||
search_categories.sort(key=lambda x: x['filename'])
|
search_categories.sort(key=lambda x: x['filename'])
|
||||||
|
|
||||||
|
|||||||
@@ -10,6 +10,21 @@ import sys
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Dict, Any
|
from typing import List, Dict, Any
|
||||||
|
|
||||||
|
from categorize_workflows import categorize_by_filename
|
||||||
|
|
||||||
|
|
||||||
|
def load_categories():
    """Load the search categories file; return [] when missing or unparsable."""
    try:
        with open('context/search_categories.json', 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        # Treat an absent or corrupt file as an empty category list.
        return []
|
||||||
|
|
||||||
|
def save_categories(data):
    """Persist the search categories list as pretty-printed UTF-8 JSON."""
    path = 'context/search_categories.json'
    with open(path, 'w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)
||||||
|
|
||||||
class WorkflowImporter:
|
class WorkflowImporter:
|
||||||
"""Import n8n workflows with progress tracking and error handling."""
|
"""Import n8n workflows with progress tracking and error handling."""
|
||||||
@@ -56,6 +71,32 @@ class WorkflowImporter:
|
|||||||
|
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
print(f"✅ Imported: {file_path.name}")
|
print(f"✅ Imported: {file_path.name}")
|
||||||
|
|
||||||
|
# Categorize the workflow and update search_categories.json
|
||||||
|
suggested_category = categorize_by_filename(file_path.name)
|
||||||
|
|
||||||
|
all_workflows_data = load_categories()
|
||||||
|
|
||||||
|
found = False
|
||||||
|
for workflow_entry in all_workflows_data:
|
||||||
|
if workflow_entry.get('filename') == file_path.name:
|
||||||
|
workflow_entry['category'] = suggested_category
|
||||||
|
found = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if not found:
|
||||||
|
# Add new workflow entry if not found (e.g., first import)
|
||||||
|
all_workflows_data.append({
|
||||||
|
"filename": file_path.name,
|
||||||
|
"category": suggested_category,
|
||||||
|
"name": file_path.stem, # Assuming workflow name is filename without extension
|
||||||
|
"description": "", # Placeholder, can be updated manually
|
||||||
|
"nodes": [] # Placeholder, can be updated manually
|
||||||
|
})
|
||||||
|
|
||||||
|
save_categories(all_workflows_data)
|
||||||
|
print(f" Categorized '{file_path.name}' as '{suggested_category or 'Uncategorized'}'")
|
||||||
|
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
error_msg = result.stderr.strip() or result.stdout.strip()
|
error_msg = result.stderr.strip() or result.stdout.strip()
|
||||||
@@ -141,6 +182,7 @@ def check_n8n_available() -> bool:
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Main entry point."""
|
"""Main entry point."""
|
||||||
|
sys.stdout.reconfigure(encoding='utf-8')
|
||||||
print("🔧 N8N Workflow Importer")
|
print("🔧 N8N Workflow Importer")
|
||||||
print("=" * 40)
|
print("=" * 40)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user