fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82)

fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82)
This commit is contained in:
Praveen Mudalgeri
2025-08-12 10:34:22 +05:30
parent e2fa5a7505
commit 4c61958fbc
2 changed files with 25 additions and 19 deletions

View File

@@ -1,20 +1,20 @@
import json import json
import os import os
from pathlib import Path from pathlib import Path
import glob
import re
def load_def_categories():
    """Load the integration-to-category mapping from def_categories.json.

    The file ``context/def_categories.json`` (relative to the current working
    directory) is expected to hold a JSON array of objects, each carrying an
    ``"integration"`` and a ``"category"`` entry.

    Returns:
        dict: normalized integration name -> category. Names are lowercased
        and stripped of every character outside ``[a-z0-9]`` so that lookups
        can normalize filename tokens the same way. When two entries
        normalize to the same key, the later entry wins.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if an entry lacks "integration" or "category".
    """
    def_categories_path = Path("context/def_categories.json")
    with open(def_categories_path, 'r', encoding='utf-8') as f:
        entries = json.load(f)

    integration_to_category = {}
    for item in entries:
        # Normalize keys: strip non-alphanumerics and lowercase
        key = re.sub(r"[^a-z0-9]", "", item["integration"].lower())
        category = item["category"]
        if not key:
            # A name with no alphanumerics would become the key "", which is
            # a substring of every token and would poison partial matching.
            continue
        integration_to_category[key] = category

    return integration_to_category
def extract_tokens_from_filename(filename): def extract_tokens_from_filename(filename):
@@ -33,14 +33,17 @@ def extract_tokens_from_filename(filename):
def find_matching_category(tokens, integration_to_category):
    """Find the first matching category for the given tokens.

    Tokens are normalized the same way as the mapping keys (lowercased, with
    every character outside ``[a-z0-9]`` removed). Matching runs in two
    passes so that an exact hit on any token beats a partial hit on an
    earlier token.

    Args:
        tokens: iterable of strings extracted from a workflow filename.
        integration_to_category: dict of normalized integration name ->
            category.

    Returns:
        str: the matched category, or "" when nothing matches.
    """
    # Normalize each token once. Tokens that normalize to "" are dropped:
    # the empty string is a substring of every key, so keeping them would
    # make the partial pass return the first category for pure punctuation.
    normalized = [
        norm
        for norm in (re.sub(r"[^a-z0-9]", "", token.lower()) for token in tokens)
        if norm
    ]

    # Pass 1: exact match on the normalized token.
    for norm in normalized:
        if norm in integration_to_category:
            return integration_to_category[norm]

    # Pass 2: try partial matches for common variations.
    for norm in normalized:
        for integration_key, category in integration_to_category.items():
            if not integration_key:
                # An empty key is contained in every token; never match it.
                continue
            if norm in integration_key or integration_key in norm:
                return category

    return ""
@@ -50,13 +53,17 @@ def main():
# Get all JSON files from workflows directory # Get all JSON files from workflows directory
workflows_dir = Path("workflows") workflows_dir = Path("workflows")
json_files = list(workflows_dir.glob("*.json")) json_files = glob.glob(
os.path.join(workflows_dir, "**", "*.json"),
recursive=True
)
# Process each file # Process each file
search_categories = [] search_categories = []
for json_file in json_files: for json_file in json_files:
filename = json_file.name path_obj = Path(json_file)
filename = path_obj.name
tokens = extract_tokens_from_filename(filename) tokens = extract_tokens_from_filename(filename)
category = find_matching_category(tokens, integration_to_category) category = find_matching_category(tokens, integration_to_category)

View File

@@ -434,10 +434,9 @@ class WorkflowDatabase:
if not os.path.exists(self.workflows_dir): if not os.path.exists(self.workflows_dir):
print(f"Warning: Workflows directory '{self.workflows_dir}' not found.") print(f"Warning: Workflows directory '{self.workflows_dir}' not found.")
return {'processed': 0, 'skipped': 0, 'errors': 0} return {'processed': 0, 'skipped': 0, 'errors': 0}
workflows_path = Path(self.workflows_dir) workflows_path = Path(self.workflows_dir)
json_files = list(workflows_path.rglob("*.json")) json_files = [str(p) for p in workflows_path.rglob("*.json")]
# json_files = glob.glob(os.path.join(self.workflows_dir, "*.json"), recursive=True)
if not json_files: if not json_files:
print(f"Warning: No JSON files found in '{self.workflows_dir}' directory.") print(f"Warning: No JSON files found in '{self.workflows_dir}' directory.")