Merge pull request #83 from PraveenMudalgeri/add-folder-support

fix: enable recursive workflow discovery and normalize categories (Issue #82)
Eliad Shahar
2025-08-13 00:59:29 +03:00
committed by GitHub
6 changed files with 8428 additions and 23 deletions

View File

@@ -1,10 +1,12 @@
# n8n-workflows Repository
## Overview
This repository contains a collection of n8n workflow automation files. n8n is a workflow automation tool that lets you build complex automations through a visual, node-based interface. Each workflow is stored as a JSON file containing node definitions, connections, and configuration.
## Repository Structure
```bash
n8n-workflows/
├── workflows/          # Main directory containing all n8n workflow JSON files
│   ├── *.json          # Individual workflow files
@@ -14,7 +16,9 @@ n8n-workflows/
```
## Workflow File Format
Each workflow JSON file contains (see the sketch after this list):
- **name**: Workflow identifier
- **nodes**: Array of node objects defining operations
- **connections**: Object defining how nodes are linked together
@@ -24,6 +28,7 @@ n8n-workflows/
- **createdAt/updatedAt**: Timestamps
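As a quick illustration, here is a minimal sketch of loading one of these files and reading the core fields (the filename is hypothetical):
```python
import json

# Load a workflow file and inspect its required fields (path is hypothetical)
with open("workflows/example_workflow.json", "r", encoding="utf-8") as f:
    workflow = json.load(f)

print(workflow["name"])                      # workflow identifier
print(len(workflow["nodes"]))                # how many nodes it defines
print(list(workflow["connections"].keys()))  # nodes with outgoing connections
```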
## Common Node Types
- **Trigger nodes**: webhook, cron, manual
- **Integration nodes**: HTTP requests, database connectors, API integrations
- **Logic nodes**: IF, Switch, Merge, Loop
@@ -33,14 +38,18 @@ n8n-workflows/
## Using This Repository
### Suggestions for Analysis Tasks
When analyzing workflows in this repository (see the sketch after this list):
1. Parse the JSON files to understand the workflow structure
2. Examine the node chain to determine what the workflow implements
3. Identify external integrations and dependencies
4. Consider the business logic implemented by the node connections
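A minimal sketch of steps 1, 2, and 4, assuming the standard fields described above and the usual `"main"` connection layout (the path is hypothetical):
```python
import json
from pathlib import Path

def summarize(path: Path) -> None:
    """Print each connection as 'source (type) -> target'."""
    workflow = json.loads(path.read_text(encoding="utf-8"))
    node_types = {n["name"]: n["type"] for n in workflow.get("nodes", [])}
    for source, outputs in workflow.get("connections", {}).items():
        # "main" holds the ordinary execution links between nodes
        for branch in outputs.get("main", []):
            for link in branch:
                print(f"{source} ({node_types.get(source)}) -> {link['node']}")

summarize(Path("workflows/example_workflow.json"))  # hypothetical file
```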
### Suggestions for Documentation Tasks
When documenting workflows (a sketch follows this list):
1. Verify that existing descriptions match the actual implementation
2. Identify trigger mechanisms and schedules
3. List all external services and APIs used
@@ -48,7 +57,9 @@ n8n-workflows/
5. Highlight any error handling or retry mechanisms
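For steps 2 and 3, a sketch that separates trigger nodes from the external services a workflow touches; the `n8n-nodes-base.` prefix check reflects common n8n node naming and is an assumption here:
```python
def triggers_and_services(workflow: dict) -> tuple[list, set]:
    """Collect trigger node names and the integrations a workflow uses."""
    triggers, services = [], set()
    for node in workflow.get("nodes", []):
        node_type = node.get("type", "")  # e.g. "n8n-nodes-base.slack"
        if "trigger" in node_type.lower() or "webhook" in node_type.lower():
            triggers.append(node.get("name"))
        elif node_type.startswith("n8n-nodes-base."):
            services.add(node_type.split(".", 1)[1])
    return triggers, services
```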
### Suggestions for Modification Tasks
When modifying workflows:
1. Preserve the JSON structure and required fields
2. Keep node IDs unique
3. Update connections when adding or removing nodes
@@ -57,17 +68,20 @@ n8n-workflows/
## Key Considerations
### Security
- Workflow files may contain sensitive information in webhook URLs or API configurations
- Credentials are usually stored separately in n8n, not in the workflow files
- Handle any hardcoded values or endpoints with care
### Best Practices
- Workflows should have clear, descriptive names
- Complex workflows benefit from documentation nodes or comments
- Error-handling nodes improve reliability
- Modular workflows (calling sub-workflows) improve maintainability
### Common Patterns
- **Data pipeline**: Trigger → Fetch data → Transform → Store/Send
- **Integration sync**: Scheduled job → API call → Compare → Update systems
- **Automation**: Webhook → Process → Conditional logic → Take action
@@ -82,27 +96,32 @@ n8n-workflows/
2. **Documentation generation**: Write descriptions that explain what a workflow accomplishes, not just which nodes it contains.
3. **Troubleshooting**: Common issues include:
   - Incorrectly connected nodes
   - Missing error handling
   - Inefficient data processing inside loops
   - Hardcoded values that should be parameterized
4. **Optimization suggestions**:
   - Identify redundant operations
   - Suggest batching where appropriate
   - Recommend adding error handling
   - Suggest splitting overly complex workflows
5. **Code generation**: When building tools that analyze these workflows (see the sketch after this list):
   - Handle different n8n format versions
   - Account for custom nodes
   - Parse expressions in node parameters
   - Respect node execution order
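For expression parsing in particular, n8n node parameters typically embed expressions as `{{ ... }}` referencing values such as `$json` or `$node`, so a simple scan can surface them; this is a rough sketch with a hypothetical parameter value:
```python
import re

# Matches the {{ ... }} expression syntax used in n8n node parameters
EXPRESSION = re.compile(r"\{\{(.+?)\}\}")

param = "=Order {{ $json.order_id }} for {{ $json.customer_name }} shipped"
for match in EXPRESSION.finditer(param):
    print(match.group(1).strip())  # -> $json.order_id, $json.customer_name
```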
## Repository-Specific Information
[Add any specific details about the workflows, naming conventions, or special considerations here]
## Version Compatibility
- n8n version: [specify which n8n versions these workflows are compatible with]
- Last updated: [date of the last major update]
- Migration notes: [any version-specific considerations]

File diff suppressed because it is too large

View File

@@ -1,3 +1,18 @@
[
"Uncategorized"
"AI Agent Development",
"Business Process Automation",
"CRM & Sales",
"Cloud Storage & File Management",
"Communication & Messaging",
"Creative Content & Video Automation",
"Creative Design Automation",
"Data Processing & Analysis",
"E-commerce & Retail",
"Financial & Accounting",
"Marketing & Advertising Automation",
"Project Management",
"Social Media Management",
"Technical Infrastructure & DevOps",
"Uncategorized",
"Web Scraping & Data Extraction"
]

View File

@@ -1,20 +1,20 @@
import json
import os
from pathlib import Path
import glob
import re
def load_def_categories():
    """Load the definition categories from def_categories.json"""
    def_categories_path = Path("context/def_categories.json")
    with open(def_categories_path, 'r', encoding='utf-8') as f:
        categories_data = json.load(f)
    # Create a mapping from integration name (lowercase) to category
    integration_to_category = {}
    for item in categories_data:
        integration = item['integration'].lower()
        category = item['category']
        integration_to_category[integration] = category
        raw_map = json.load(f)
    # Normalize keys: strip non-alphanumerics and lowercase
    integration_to_category = {
        re.sub(r"[^a-z0-9]", "", item["integration"].lower()): item["category"]
        for item in raw_map
    }
    return integration_to_category
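# Example (hypothetical data): an entry such as
#   {"integration": "Google Sheets", "category": "Data Processing & Analysis"}
# normalizes to the key "googlesheets", so filename tokens like "google-sheets"
# or "GoogleSheets" all resolve to the same category.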
def extract_tokens_from_filename(filename):
@@ -33,30 +33,134 @@ def extract_tokens_from_filename(filename):
def find_matching_category(tokens, integration_to_category):
    """Find the first matching category for the given tokens"""
    for token in tokens:
        if token in integration_to_category:
            return integration_to_category[token]
        # Normalize token same as keys
        norm = re.sub(r"[^a-z0-9]", "", token.lower())
        if norm in integration_to_category:
            return integration_to_category[norm]
    # Try partial matches for common variations
    for token in tokens:
        for integration in integration_to_category:
            if token in integration or integration in token:
                return integration_to_category[integration]
        norm = re.sub(r"[^a-z0-9]", "", token.lower())
        for integration_key in integration_to_category:
            if norm in integration_key or integration_key in norm:
                return integration_to_category[integration_key]
    return ""
def categorize_by_filename(filename):
    """
    Categorize workflow based on filename patterns.
    Returns the most likely category, or an empty string if no pattern matches.
    """
    filename_lower = filename.lower()
    # Security & Authentication
    if any(word in filename_lower for word in ['totp', 'bitwarden', 'auth', 'security']):
        return "Technical Infrastructure & DevOps"
    # Data Processing & File Operations
    if any(word in filename_lower for word in ['process', 'writebinaryfile', 'readbinaryfile', 'extractfromfile', 'converttofile', 'googlefirebasecloudfirestore', 'supabase', 'surveymonkey', 'renamekeys', 'readpdf', 'wufoo', 'splitinbatches', 'airtop', 'comparedatasets', 'spreadsheetfile']):
        return "Data Processing & Analysis"
    # Utility & Business Process Automation
    if any(word in filename_lower for word in ['noop', 'code', 'schedule', 'filter', 'splitout', 'wait', 'limit', 'aggregate', 'acuityscheduling', 'eventbrite', 'philipshue', 'stickynote', 'n8ntrainingcustomerdatastore', 'n8n']):
        return "Business Process Automation"
    # Webhook & API related
    if any(word in filename_lower for word in ['webhook', 'respondtowebhook', 'http', 'rssfeedread']):
        return "Web Scraping & Data Extraction"
    # Form & Data Collection
    if any(word in filename_lower for word in ['form', 'typeform', 'jotform']):
        return "Data Processing & Analysis"
    # Local file operations
    if any(word in filename_lower for word in ['localfile', 'filemaker']):
        return "Cloud Storage & File Management"
    # Database operations
    if any(word in filename_lower for word in ['postgres', 'mysql', 'mongodb', 'redis', 'elasticsearch', 'snowflake']):
        return "Data Processing & Analysis"
    # AI & Machine Learning
    if any(word in filename_lower for word in ['openai', 'awstextract', 'awsrekognition', 'humanticai', 'openthesaurus', 'googletranslate', 'summarize']):
        return "AI Agent Development"
    # E-commerce specific
    if any(word in filename_lower for word in ['woocommerce', 'gumroad']):
        return "E-commerce & Retail"
    # Social media specific
    if any(word in filename_lower for word in ['facebook', 'linkedin', 'instagram']):
        return "Social Media Management"
    # Customer support
    if any(word in filename_lower for word in ['zendesk', 'intercom', 'drift', 'pagerduty']):
        return "Communication & Messaging"
    # Analytics & Tracking
    if any(word in filename_lower for word in ['googleanalytics', 'segment', 'mixpanel']):
        return "Data Processing & Analysis"
    # Development tools
    if any(word in filename_lower for word in ['git', 'github', 'gitlab', 'travisci', 'jenkins', 'uptimerobot', 'gsuiteadmin', 'debughelper', 'bitbucket']):
        return "Technical Infrastructure & DevOps"
    # CRM & Sales tools
    if any(word in filename_lower for word in ['pipedrive', 'hubspot', 'salesforce', 'copper', 'orbit', 'agilecrm']):
        return "CRM & Sales"
    # Marketing tools
    if any(word in filename_lower for word in ['mailchimp', 'convertkit', 'sendgrid', 'mailerlite', 'lemlist', 'sendy', 'postmark', 'mailgun']):
        return "Marketing & Advertising Automation"
    # Project management
    if any(word in filename_lower for word in ['asana', 'mondaycom', 'clickup', 'trello', 'notion', 'toggl', 'microsofttodo', 'calendly', 'jira']):
        return "Project Management"
    # Communication
    if any(word in filename_lower for word in ['slack', 'telegram', 'discord', 'mattermost', 'twilio', 'emailreadimap', 'teams', 'gotowebinar']):
        return "Communication & Messaging"
    # Cloud storage
    if any(word in filename_lower for word in ['dropbox', 'googledrive', 'onedrive', 'awss3', 'googledocs']):
        return "Cloud Storage & File Management"
    # Creative tools
    if any(word in filename_lower for word in ['canva', 'figma', 'bannerbear', 'editimage']):
        return "Creative Design Automation"
    # Video & content
    if any(word in filename_lower for word in ['youtube', 'vimeo', 'storyblok', 'strapi']):
        return "Creative Content & Video Automation"
    # Financial tools
    if any(word in filename_lower for word in ['stripe', 'chargebee', 'quickbooks', 'harvest']):
        return "Financial & Accounting"
    # Weather & external APIs
    if any(word in filename_lower for word in ['openweathermap', 'nasa', 'crypto', 'coingecko']):
        return "Web Scraping & Data Extraction"
    return ""
def main():
    # Load definition categories
    integration_to_category = load_def_categories()
    # Get all JSON files from workflows directory
    workflows_dir = Path("workflows")
    json_files = list(workflows_dir.glob("*.json"))
    json_files = glob.glob(
        os.path.join(workflows_dir, "**", "*.json"),
        recursive=True
    )
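    # Note: pathlib offers an equivalent recursive search, as used in
    # workflow_db.py: json_files = [str(p) for p in workflows_dir.rglob("*.json")]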
    # Process each file
    search_categories = []
    for json_file in json_files:
        filename = json_file.name
        path_obj = Path(json_file)
        filename = path_obj.name
        tokens = extract_tokens_from_filename(filename)
        category = find_matching_category(tokens, integration_to_category)
@@ -64,6 +168,11 @@ def main():
"filename": filename,
"category": category
})
# Second pass for categorization
for item in search_categories:
if not item['category']:
item['category'] = categorize_by_filename(item['filename'])
# Sort by filename for consistency
search_categories.sort(key=lambda x: x['filename'])
@@ -136,4 +245,4 @@ def main():
print("="*50)
if __name__ == "__main__":
main()
main()

View File

@@ -10,6 +10,21 @@ import sys
from pathlib import Path
from typing import List, Dict, Any
from categorize_workflows import categorize_by_filename
def load_categories():
    """Load the search categories file."""
    try:
        with open('context/search_categories.json', 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return []
def save_categories(data):
    """Save the search categories file."""
    with open('context/search_categories.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
class WorkflowImporter:
    """Import n8n workflows with progress tracking and error handling."""
@@ -56,6 +71,32 @@ class WorkflowImporter:
        if result.returncode == 0:
            print(f"✅ Imported: {file_path.name}")
            # Categorize the workflow and update search_categories.json
            suggested_category = categorize_by_filename(file_path.name)
            all_workflows_data = load_categories()
            found = False
            for workflow_entry in all_workflows_data:
                if workflow_entry.get('filename') == file_path.name:
                    workflow_entry['category'] = suggested_category
                    found = True
                    break
            if not found:
                # Add new workflow entry if not found (e.g., first import)
                all_workflows_data.append({
                    "filename": file_path.name,
                    "category": suggested_category,
                    "name": file_path.stem,  # Assuming workflow name is filename without extension
                    "description": "",  # Placeholder, can be updated manually
                    "nodes": []  # Placeholder, can be updated manually
                })
            save_categories(all_workflows_data)
            print(f"   Categorized '{file_path.name}' as '{suggested_category or 'Uncategorized'}'")
            return True
        else:
            error_msg = result.stderr.strip() or result.stdout.strip()
@@ -141,6 +182,7 @@ def check_n8n_available() -> bool:
def main():
    """Main entry point."""
    sys.stdout.reconfigure(encoding='utf-8')
    print("🔧 N8N Workflow Importer")
    print("=" * 40)

View File

@@ -434,10 +434,9 @@ class WorkflowDatabase:
        if not os.path.exists(self.workflows_dir):
            print(f"Warning: Workflows directory '{self.workflows_dir}' not found.")
            return {'processed': 0, 'skipped': 0, 'errors': 0}
        workflows_path = Path(self.workflows_dir)
        json_files = list(workflows_path.rglob("*.json"))
        # json_files = glob.glob(os.path.join(self.workflows_dir, "*.json"), recursive=True)
        json_files = [str(p) for p in workflows_path.rglob("*.json")]
        if not json_files:
            print(f"Warning: No JSON files found in '{self.workflows_dir}' directory.")