fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82)
fix: recursive indexing and normalization in create_categories.py & workflow_db.py (fixes #82)
This commit is contained in:
@@ -1,20 +1,20 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import glob
|
||||||
|
import re
|
||||||
|
|
||||||
def load_def_categories(def_categories_path=None):
    """Load the integration -> category mapping from def_categories.json.

    The JSON file is expected to hold a list of objects, each with an
    ``"integration"`` and a ``"category"`` string.  Integration names are
    normalized (lowercased, every character outside ``a-z0-9`` removed) so
    that e.g. ``"Google Sheets"`` and ``"google-sheets"`` share one key.

    Args:
        def_categories_path: Optional path to the JSON file.  Defaults to
            ``context/def_categories.json`` relative to the current working
            directory, which is what existing callers rely on.

    Returns:
        dict mapping normalized integration name -> category.  When two
        entries normalize to the same key, the later entry wins.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if an entry lacks ``"integration"`` or ``"category"``.
    """
    if def_categories_path is None:
        def_categories_path = Path("context/def_categories.json")

    with open(def_categories_path, 'r', encoding='utf-8') as f:
        raw_map = json.load(f)

    integration_to_category = {}
    for item in raw_map:
        # Normalize keys: strip non-alphanumerics and lowercase
        key = re.sub(r"[^a-z0-9]", "", item["integration"].lower())
        category = item["category"]
        if not key:
            # A name with no ASCII alphanumerics would become the key "",
            # and "" is a substring of every token, so a substring-based
            # lookup would match it against everything.  Drop it.
            continue
        integration_to_category[key] = category
    return integration_to_category
|
||||||
|
|
||||||
def extract_tokens_from_filename(filename):
|
def extract_tokens_from_filename(filename):
|
||||||
@@ -33,14 +33,17 @@ def extract_tokens_from_filename(filename):
|
|||||||
def find_matching_category(tokens, integration_to_category):
    """Find the first matching category for the given tokens.

    Matching runs in two passes over the tokens, in order:

    1. Exact match of the normalized token against the mapping keys.
    2. Partial match: the normalized token contains a key, or a key
       contains the normalized token.

    Tokens are normalized the same way as the mapping keys (lowercased,
    every character outside ``a-z0-9`` removed).

    Args:
        tokens: iterable of filename tokens (strings).
        integration_to_category: dict of normalized integration name ->
            category.

    Returns:
        The category of the first match, or ``""`` when nothing matches.
    """
    # Normalize once and reuse in both passes.  Tokens that normalize to ""
    # are dropped: "" is a substring of every string, so in the partial pass
    # such a token would "match" the first key in the mapping.
    normalized_tokens = [
        norm
        for norm in (_normalize_integration_name(token) for token in tokens)
        if norm
    ]

    # Pass 1: exact matches take priority over any partial match.
    for norm in normalized_tokens:
        if norm in integration_to_category:
            return integration_to_category[norm]

    # Pass 2: try partial matches for common variations
    for norm in normalized_tokens:
        for integration_key, category in integration_to_category.items():
            if not integration_key:
                # An empty key is contained in every token; never match it.
                continue
            if norm in integration_key or integration_key in norm:
                return category

    return ""


def _normalize_integration_name(name):
    """Lowercase *name* and strip every character outside ``a-z0-9``."""
    return re.sub(r"[^a-z0-9]", "", name.lower())
|
||||||
|
|
||||||
@@ -50,13 +53,17 @@ def main():
|
|||||||
|
|
||||||
# Get all JSON files from workflows directory
|
# Get all JSON files from workflows directory
|
||||||
workflows_dir = Path("workflows")
|
workflows_dir = Path("workflows")
|
||||||
json_files = list(workflows_dir.glob("*.json"))
|
json_files = glob.glob(
|
||||||
|
os.path.join(workflows_dir, "**", "*.json"),
|
||||||
|
recursive=True
|
||||||
|
)
|
||||||
|
|
||||||
# Process each file
|
# Process each file
|
||||||
search_categories = []
|
search_categories = []
|
||||||
|
|
||||||
for json_file in json_files:
|
for json_file in json_files:
|
||||||
filename = json_file.name
|
path_obj = Path(json_file)
|
||||||
|
filename = path_obj.name
|
||||||
tokens = extract_tokens_from_filename(filename)
|
tokens = extract_tokens_from_filename(filename)
|
||||||
category = find_matching_category(tokens, integration_to_category)
|
category = find_matching_category(tokens, integration_to_category)
|
||||||
|
|
||||||
|
|||||||
@@ -436,8 +436,7 @@ class WorkflowDatabase:
|
|||||||
return {'processed': 0, 'skipped': 0, 'errors': 0}
|
return {'processed': 0, 'skipped': 0, 'errors': 0}
|
||||||
|
|
||||||
workflows_path = Path(self.workflows_dir)
|
workflows_path = Path(self.workflows_dir)
|
||||||
json_files = list(workflows_path.rglob("*.json"))
|
json_files = [str(p) for p in workflows_path.rglob("*.json")]
|
||||||
# json_files = glob.glob(os.path.join(self.workflows_dir, "*.json"), recursive=True)
|
|
||||||
|
|
||||||
if not json_files:
|
if not json_files:
|
||||||
print(f"Warning: No JSON files found in '{self.workflows_dir}' directory.")
|
print(f"Warning: No JSON files found in '{self.workflows_dir}' directory.")
|
||||||
|
|||||||
Reference in New Issue
Block a user