refactor: merge categorization scripts
Merge the logic from categorize_workflows.py into create_categories.py to simplify the categorization process. The categorize_workflows.py script is now deleted.
This commit is contained in:
@@ -1,216 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Script to categorize uncategorized n8n workflows based on filename patterns.
|
|
||||||
This will help reduce the count of uncategorized workflows.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
def load_categories():
    """Read and return the workflow entries from context/search_categories.json."""
    path = 'context/search_categories.json'
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
|
|
||||||
|
|
||||||
def load_unique_categories():
    """Read and return the unique-categories list from context/unique_categories.json."""
    path = 'context/unique_categories.json'
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
|
|
||||||
|
|
||||||
def categorize_by_filename(filename):
    """
    Suggest a category for an n8n workflow based on keywords in its filename.

    Matching is case-insensitive substring search; the FIRST matching rule
    wins, so rule order matters (e.g. a filename containing both 'auth' and
    'slack' is classified as infrastructure, not communication).

    Returns the most likely category name, or None if uncertain.
    """
    # Ordered (keywords, category) rules; earlier entries take precedence.
    rules = (
        # Security & Authentication
        (('totp', 'bitwarden', 'auth', 'security'),
         "Technical Infrastructure & DevOps"),
        # Data Processing & File Operations
        (('process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile', 'converttofile'),
         "Data Processing & Analysis"),
        # Utility & Business Process Automation
        (('noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit', 'aggregate'),
         "Business Process Automation"),
        # Webhook & API related
        (('webhook', 'respondtowebhook', 'http'),
         "Web Scraping & Data Extraction"),
        # Form & Data Collection
        (('form', 'typeform', 'jotform'),
         "Data Processing & Analysis"),
        # Local file operations
        (('localfile', 'filemaker'),
         "Cloud Storage & File Management"),
        # Database operations
        (('postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake'),
         "Data Processing & Analysis"),
        # AI & Machine Learning
        (('openai', 'awstextract', 'awsrekognition', 'humanticai', 'openthesaurus'),
         "AI Agent Development"),
        # E-commerce specific
        (('woocommerce', 'gumroad'),
         "E-commerce & Retail"),
        # Social media specific
        (('facebook', 'linkedin', 'instagram'),
         "Social Media Management"),
        # Customer support
        (('zendesk', 'intercom', 'drift', 'pagerduty'),
         "Communication & Messaging"),
        # Analytics & Tracking
        (('googleanalytics', 'segment', 'mixpanel'),
         "Data Processing & Analysis"),
        # Development tools
        (('git', 'github', 'gitlab', 'travisci', 'jenkins'),
         "Technical Infrastructure & DevOps"),
        # CRM & Sales tools
        (('pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit'),
         "CRM & Sales"),
        # Marketing tools
        (('mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist'),
         "Marketing & Advertising Automation"),
        # Project management
        (('asana', 'mondaycom', 'clickup', 'trello', 'notion'),
         "Project Management"),
        # Communication
        (('slack', 'telegram', 'discord', 'mattermost', 'twilio'),
         "Communication & Messaging"),
        # Cloud storage
        (('dropbox', 'googledrive', 'onedrive', 'awss3'),
         "Cloud Storage & File Management"),
        # Creative tools
        (('canva', 'figma', 'bannerbear', 'editimage'),
         "Creative Design Automation"),
        # Video & content
        (('youtube', 'vimeo', 'storyblok', 'strapi'),
         "Creative Content & Video Automation"),
        # Financial tools
        (('stripe', 'chargebee', 'quickbooks', 'harvest'),
         "Financial & Accounting"),
        # Weather & external APIs
        (('openweathermap', 'nasa', 'crypto', 'coingecko'),
         "Web Scraping & Data Extraction"),
    )

    lowered = filename.lower()
    for keywords, category in rules:
        if any(word in lowered for word in keywords):
            return category
    return None
|
|
||||||
|
|
||||||
def _category_distribution(workflows):
    """Tally workflows per category.

    Returns a (counts, uncategorized) pair where counts maps category name
    to workflow count and uncategorized is the number of entries whose
    'category' field is empty/falsy.
    """
    counts = defaultdict(int)
    uncategorized = 0
    for workflow in workflows:
        if workflow['category']:
            counts[workflow['category']] += 1
        else:
            uncategorized += 1
    return counts, uncategorized


def _print_distribution(counts, uncategorized, heading):
    """Print a category distribution (sorted by name) under the given heading."""
    print(heading)
    for category, count in sorted(counts.items()):
        print(f"  {category}: {count}")
    print(f"  Uncategorized: {uncategorized}")


def main():
    """Main function to categorize workflows.

    Loads context/search_categories.json, suggests a category for every
    uncategorized workflow via categorize_by_filename(), reports the
    potential improvement, and — after interactive confirmation — writes
    the applied categorizations back to the same file.
    """
    print("Loading workflow categories...")
    workflows = load_categories()
    # NOTE(review): unique_categories is loaded but never used below —
    # kept for parity with the original behavior; confirm before removing.
    unique_categories = load_unique_categories()

    print(f"Total workflows: {len(workflows)}")

    category_counts, uncategorized_count = _category_distribution(workflows)
    _print_distribution(category_counts, uncategorized_count,
                        "\nCurrent category distribution:")

    # Workflows still lacking a category.
    uncategorized_workflows = [w for w in workflows if not w['category']]
    print(f"\nAnalyzing {len(uncategorized_workflows)} uncategorized workflows...")

    # Suggest categories from filename patterns.
    suggested_categories = {}
    uncertain_workflows = []
    for workflow in uncategorized_workflows:
        filename = workflow['filename']
        suggested_category = categorize_by_filename(filename)
        if suggested_category:
            suggested_categories[filename] = suggested_category
        else:
            uncertain_workflows.append(filename)

    print(f"\nSuggested categorizations: {len(suggested_categories)}")
    # BUG FIX: previously reported len(uncategorized_workflows) — the
    # pre-analysis total — instead of the number still uncertain.
    print(f"Still uncertain: {len(uncertain_workflows)}")

    # Show suggested categorizations.
    if suggested_categories:
        print("\nSuggested categorizations:")
        for filename, category in sorted(suggested_categories.items()):
            # BUG FIX: the loop variable was unused and a placeholder was
            # printed instead of the actual filename.
            print(f"  {filename} → {category}")

    # Show workflows that still need a human decision.
    if uncertain_workflows:
        print(f"\nWorkflows that need manual review:")
        for filename in sorted(uncertain_workflows):
            # BUG FIX: same unused-loop-variable issue as above.
            print(f"  {filename}")

    # Calculate potential improvement.
    potential_categorized = len(suggested_categories)
    new_uncategorized_count = uncategorized_count - potential_categorized
    print(f"\nPotential improvement:")
    print(f"  Current uncategorized: {uncategorized_count}")
    print(f"  After auto-categorization: {new_uncategorized_count}")
    if uncategorized_count:
        # BUG FIX: guard against ZeroDivisionError when every workflow is
        # already categorized.
        reduction_pct = potential_categorized / uncategorized_count * 100
        print(f"  Reduction: {potential_categorized} workflows ({reduction_pct:.1f}%)")

    # Ask for confirmation before mutating the JSON file.
    if suggested_categories:
        response = input(f"\nWould you like to apply these {len(suggested_categories)} suggested categorizations? (y/n): ")

        if response.lower() in ['y', 'yes']:
            # Apply the categorizations in place.
            for workflow in workflows:
                if workflow['filename'] in suggested_categories:
                    workflow['category'] = suggested_categories[workflow['filename']]

            # Save the updated file.
            with open('context/search_categories.json', 'w', encoding='utf-8') as f:
                json.dump(workflows, f, indent=2, ensure_ascii=False)

            print("✅ Categorizations applied and saved!")

            new_counts, new_uncat = _category_distribution(workflows)
            _print_distribution(new_counts, new_uncat,
                                "\nNew category distribution:")
        else:
            print("No changes applied.")
|
|
||||||
|
|
||||||
# Entry point: run the categorizer only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -47,6 +47,103 @@ def find_matching_category(tokens, integration_to_category):
|
|||||||
|
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
def categorize_by_filename(filename):
    """
    Categorize a workflow based on case-insensitive keyword matches in its
    filename. The FIRST matching rule wins, so rule order is significant.

    Returns the most likely category name, or "" if uncertain.
    """
    # Ordered (keywords, category) rules; earlier entries take precedence.
    rules = (
        # Security & Authentication
        (('totp', 'bitwarden', 'auth', 'security'),
         "Technical Infrastructure & DevOps"),
        # Data Processing & File Operations
        (('process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile',
          'converttofile', 'googlefirebasecloudfirestore', 'supabase',
          'surveymonkey', 'renamekeys', 'readpdf', 'wufoo', 'splitinbatches',
          'airtop', 'comparedatasets', 'spreadsheetfile'),
         "Data Processing & Analysis"),
        # Utility & Business Process Automation
        (('noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit',
          'aggregate', 'acuityscheduling', 'eventbrite', 'philipshue',
          'stickynote', 'n8ntrainingcustomerdatastore', 'n8n'),
         "Business Process Automation"),
        # Webhook & API related
        (('webhook', 'respondtowebhook', 'http', 'rssfeedread'),
         "Web Scraping & Data Extraction"),
        # Form & Data Collection
        (('form', 'typeform', 'jotform'),
         "Data Processing & Analysis"),
        # Local file operations
        (('localfile', 'filemaker'),
         "Cloud Storage & File Management"),
        # Database operations
        (('postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake'),
         "Data Processing & Analysis"),
        # AI & Machine Learning
        (('openai', 'awstextract', 'awsrekognition', 'humanticai',
          'openthesaurus', 'googletranslate', 'summarize'),
         "AI Agent Development"),
        # E-commerce specific
        (('woocommerce', 'gumroad'),
         "E-commerce & Retail"),
        # Social media specific
        (('facebook', 'linkedin', 'instagram'),
         "Social Media Management"),
        # Customer support
        (('zendesk', 'intercom', 'drift', 'pagerduty'),
         "Communication & Messaging"),
        # Analytics & Tracking
        (('googleanalytics', 'segment', 'mixpanel'),
         "Data Processing & Analysis"),
        # Development tools
        (('git', 'github', 'gitlab', 'travisci', 'jenkins', 'uptimerobot',
          'gsuiteadmin', 'debughelper', 'bitbucket'),
         "Technical Infrastructure & DevOps"),
        # CRM & Sales tools
        (('pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit', 'agilecrm'),
         "CRM & Sales"),
        # Marketing tools
        (('mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist',
          'sendy', 'postmark', 'mailgun'),
         "Marketing & Advertising Automation"),
        # Project management
        (('asana', 'mondaycom', 'clickup', 'trello', 'notion', 'toggl',
          'microsofttodo', 'calendly', 'jira'),
         "Project Management"),
        # Communication
        (('slack', 'telegram', 'discord', 'mattermost', 'twilio',
          'emailreadimap', 'teams', 'gotowebinar'),
         "Communication & Messaging"),
        # Cloud storage
        (('dropbox', 'googledrive', 'onedrive', 'awss3', 'googledocs'),
         "Cloud Storage & File Management"),
        # Creative tools
        (('canva', 'figma', 'bannerbear', 'editimage'),
         "Creative Design Automation"),
        # Video & content
        (('youtube', 'vimeo', 'storyblok', 'strapi'),
         "Creative Content & Video Automation"),
        # Financial tools
        (('stripe', 'chargebee', 'quickbooks', 'harvest'),
         "Financial & Accounting"),
        # Weather & external APIs
        (('openweathermap', 'nasa', 'crypto', 'coingecko'),
         "Web Scraping & Data Extraction"),
    )

    lowered = filename.lower()
    for keywords, category in rules:
        if any(word in lowered for word in keywords):
            return category
    return ""
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Load definition categories
|
# Load definition categories
|
||||||
integration_to_category = load_def_categories()
|
integration_to_category = load_def_categories()
|
||||||
@@ -72,6 +169,11 @@ def main():
|
|||||||
"category": category
|
"category": category
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Second pass for categorization
|
||||||
|
for item in search_categories:
|
||||||
|
if not item['category']:
|
||||||
|
item['category'] = categorize_by_filename(item['filename'])
|
||||||
|
|
||||||
# Sort by filename for consistency
|
# Sort by filename for consistency
|
||||||
search_categories.sort(key=lambda x: x['filename'])
|
search_categories.sort(key=lambda x: x['filename'])
|
||||||
|
|
||||||
|
|||||||
@@ -10,6 +10,21 @@ import sys
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Dict, Any
|
from typing import List, Dict, Any
|
||||||
|
|
||||||
|
from categorize_workflows import categorize_by_filename
|
||||||
|
|
||||||
|
|
||||||
|
def load_categories():
    """Load the search categories file; return [] when missing or unparsable."""
    try:
        with open('context/search_categories.json', 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        # Treat an absent or corrupt file as an empty category list.
        return []
|
||||||
|
|
||||||
|
def save_categories(data):
    """Persist the search categories list as pretty-printed UTF-8 JSON."""
    path = 'context/search_categories.json'
    with open(path, 'w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=2, ensure_ascii=False)
||||||
|
|
||||||
class WorkflowImporter:
|
class WorkflowImporter:
|
||||||
"""Import n8n workflows with progress tracking and error handling."""
|
"""Import n8n workflows with progress tracking and error handling."""
|
||||||
@@ -56,6 +71,32 @@ class WorkflowImporter:
|
|||||||
|
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
print(f"✅ Imported: {file_path.name}")
|
print(f"✅ Imported: {file_path.name}")
|
||||||
|
|
||||||
|
# Categorize the workflow and update search_categories.json
|
||||||
|
suggested_category = categorize_by_filename(file_path.name)
|
||||||
|
|
||||||
|
all_workflows_data = load_categories()
|
||||||
|
|
||||||
|
found = False
|
||||||
|
for workflow_entry in all_workflows_data:
|
||||||
|
if workflow_entry.get('filename') == file_path.name:
|
||||||
|
workflow_entry['category'] = suggested_category
|
||||||
|
found = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if not found:
|
||||||
|
# Add new workflow entry if not found (e.g., first import)
|
||||||
|
all_workflows_data.append({
|
||||||
|
"filename": file_path.name,
|
||||||
|
"category": suggested_category,
|
||||||
|
"name": file_path.stem, # Assuming workflow name is filename without extension
|
||||||
|
"description": "", # Placeholder, can be updated manually
|
||||||
|
"nodes": [] # Placeholder, can be updated manually
|
||||||
|
})
|
||||||
|
|
||||||
|
save_categories(all_workflows_data)
|
||||||
|
print(f" Categorized '{file_path.name}' as '{suggested_category or 'Uncategorized'}'")
|
||||||
|
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
error_msg = result.stderr.strip() or result.stdout.strip()
|
error_msg = result.stderr.strip() or result.stdout.strip()
|
||||||
@@ -141,6 +182,7 @@ def check_n8n_available() -> bool:
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Main entry point."""
|
"""Main entry point."""
|
||||||
|
sys.stdout.reconfigure(encoding='utf-8')
|
||||||
print("🔧 N8N Workflow Importer")
|
print("🔧 N8N Workflow Importer")
|
||||||
print("=" * 40)
|
print("=" * 40)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user