"""LLM plugin: load a Hacker News thread as a fragment via the Algolia HN API."""

import html
import json
import re
from typing import Any, Dict, List, Optional, Union

import httpx
import llm

# Compiled once at import time; both patterns run on every comment of a thread.
_LINK_RE = re.compile(r"<a[^>]*>(.*?)</a>")
_TAG_RE = re.compile(r"<[^>]*>")


@llm.hookimpl
def register_fragment_loaders(register):
    """Register the ``hn`` fragment prefix with its loader function."""
    register("hn", hacker_news_loader)


def hacker_news_loader(argument: str) -> llm.Fragment:
    """
    Fetch a Hacker News item and return it as a fragment.

    Args:
        argument: Item ID, interpolated into the Algolia items URL

    Returns:
        Fragment whose content is the thread in thread path notation and
        whose source is the news.ycombinator.com URL for the item

    Raises:
        ValueError: If the request fails, returns an error status, or the
            response body is not valid JSON
    """
    try:
        response = httpx.get(f"https://hn.algolia.com/api/v1/items/{argument}")
        response.raise_for_status()
        data = response.json()
    except Exception as ex:
        # Chain the cause so the underlying HTTP/JSON error stays in the traceback.
        raise ValueError(f"Could not load Hacker News {argument}: {str(ex)}") from ex
    return llm.Fragment(
        process_hn_comments(data),
        source=f"https://news.ycombinator.com/item?id={argument}",
    )


def _format_entry(path: str, item: Dict[str, Any], body: str) -> str:
    """Format one item as ``[path] author: body``."""
    # ``or`` rather than a .get() default: the key may be present with a null
    # value, which a default would not replace.
    author = item.get("author") or "Anonymous"
    return f"[{path}] {author}: {body}"


def convert_hn_to_thread_path(
    json_data: Dict[str, Any], path: str = "", result: Optional[List[str]] = None
) -> List[str]:
    """
    Convert Hacker News JSON hierarchy to thread path notation.

    The root item is labelled ``[1]``; its children are ``[1.1]``, ``[1.2]``,
    and so on, depth-first.

    Args:
        json_data: The JSON data from HN API
        path: Current thread path (used in recursion)
        result: Accumulator for formatted comments

    Returns:
        List of formatted comments in thread path notation
    """
    if result is None:
        result = []

    if not path:
        # Root node: it may be a story (title, possibly with text) or a
        # comment (text only). Checking values instead of key presence keeps
        # the title when "text" exists but is null.
        current_path = "1"
        title = json_data.get("title") or ""
        text = clean_html_content(json_data.get("text") or "")
        body = "\n".join(part for part in (title, text) if part)
        result.append(_format_entry(current_path, json_data, body))
    else:
        current_path = path

    # Emit each child, then recurse so its replies follow it directly.
    for i, child in enumerate(json_data.get("children") or [], 1):
        child_path = f"{current_path}.{i}"
        comment_text = clean_html_content(child.get("text") or "")
        result.append(_format_entry(child_path, child, comment_text))
        convert_hn_to_thread_path(child, child_path, result)

    return result


def clean_html_content(text: str) -> str:
    """
    Clean HTML content by removing HTML tags and unescaping entities.

    Args:
        text: HTML text to clean

    Returns:
        Cleaned text
    """
    if not text:
        return ""

    # Replace paragraph tags with newlines
    text = text.replace("<p>", "\n").replace("</p>", "")

    # Remove link tags but keep the link text
    text = _LINK_RE.sub(r"\1", text)

    # Remove all other HTML tags
    text = _TAG_RE.sub("", text)

    # Unescape entities last: escaped angle brackets such as "&lt;T&gt;" are
    # literal text and must not be mistaken for markup by the steps above.
    return html.unescape(text)


def process_hn_comments(json_str: Union[str, bytes, Dict[str, Any]]) -> str:
    """
    Process the JSON from HN API and return thread path notation.

    Args:
        json_str: JSON string from HN API, or the already-parsed object

    Returns:
        Formatted string with thread path notation, or a string starting
        with "Error processing JSON:" if parsing or formatting fails
    """
    try:
        if isinstance(json_str, (str, bytes, bytearray)):
            json_data = json.loads(json_str)
        else:
            json_data = json_str
        formatted_comments = convert_hn_to_thread_path(json_data)
        return "\n\n".join(formatted_comments)
    except Exception as e:
        # Best-effort by design: failures are reported in the returned text
        # instead of being raised.
        return f"Error processing JSON: {str(e)}"