Initial commit.

2024-07-22 02:20:35 +01:00 · 2024-07-22 02:20:35 +01:00 · 1198424f4f
commit 1198424f4f
4 changed files with 164 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+__pycache__/
--- a/README.md
+++ b/README.md
@ -0,0 +1,3 @@
+# open-webui tool: bbc news feeds
+
+A tool to interact with bbc.co.uk/news feeds, including top stories, world news, UK/regional news, and more.
--- a/src/bbc_news.py
+++ b/src/bbc_news.py
@ -0,0 +1,148 @@
+"""
+title: BBC News Feeds
+author: @nathanwindisch
+author_url: https://github.com/nathanwindisch
+funding_url: https://www.patreon.com/NathanWindisch
+version: 0.0.9
+changelog:
+- 0.0.1 - Initial upload to openwebui community.
+- 0.0.2 - Modified formatting slightly.
+- 0.0.3 - Added tool docstring, and this changelog.
+- 0.0.4 - Added funding_url to docstring.
+- 0.0.5 - Updated get_bbc_news_feed function to use a default for 
+          the ArticleType, and updated its docstring to include 
+          a list of the possible types, to assist the LLM's query.
+- 0.0.6 - Added event emitter to the get_bbc_news_feed function, 
+          to provide status updates to the user as the function 
+          executes. Also wrapped the function in a try/except, to 
+          handle any exceptions that may occur during execution.
+- 0.0.7 - Fixed a major bug where the type was not being cast 
+          to the ArticleType enum, causing the get_uri function 
+          to not be called correctly.
+- 0.0.8 - Updated the ArticleType parameter docstring to make it 
+          mandatory, and for it to contain the full names of the
+          'world/' types rather than the abbreviations.
+- 0.0.9 - Created a new function, get_bbc_news_content, which 
+          retrieves the article text content of a BBC News link, 
+          given its URI.
+"""
+
+import re
+import json
+import requests
+import xml.etree.ElementTree as ElementTree
+from typing import Awaitable, Callable
+from pydantic import BaseModel
+from enum import Enum
+from bs4 import BeautifulSoup
+
+
+class ArticleType(Enum):
+	# Every BBC News feed category this tool knows about.
+	# A member's value is the path segment that get_uri() places between
+	# ".../news/" and "/rss.xml"; top_stories is empty because get_uri()
+	# special-cases it to the root feed.
+
+	# Top-level sections.
+	top_stories = ""
+	world = "world"
+	uk = "uk"
+	business = "business"
+	politics = "politics"
+	health = "health"
+	education = "education"
+	science_and_environment = "science_and_environment"
+	technology = "technology"
+	entertainment_and_arts = "entertainment_and_arts"
+
+	# UK nations.
+	england = "england"
+	northern_ireland = "northern_ireland"
+	scotland = "scotland"
+	wales = "wales"
+
+	# World regions (nested under "world/").
+	africa = "world/africa"
+	asia = "world/asia"
+	australia = "world/australia"
+	europe = "world/europe"
+	latin_america = "world/latin_america"
+	middle_east = "world/middle_east"
+	us_and_canada = "world/us_and_canada"
+
+	def get_name(self) -> str:
+		"""Return the member name as a title-cased label, e.g. "Us And Canada"."""
+		words = self.name.split("_")
+		return " ".join(words).title()
+
+	def get_uri(self) -> str:
+		"""Return the RSS feed URL for this category."""
+		# The root feed has no category segment in its path.
+		if self.name == "top_stories":
+			return "https://feeds.bbci.co.uk/news/rss.xml"
+		return f"https://feeds.bbci.co.uk/news/{self.value}/rss.xml"
+
+# Regex to match a BBC News article URI (a raw string, so the regex escapes are
+# not read as invalid string escapes, which newer Python versions warn about).
+#  - Must use http or https.
+#  - Must be a bbc.com or bbc.co.uk domain, optionally prefixed with www.
+#  - Must be a news article or video.
+#  - Must have a valid ID (word characters: letters, digits, underscore).
+URI_REGEX = re.compile(r"^(https?:\/\/)(www\.)?bbc\.(com|co\.uk)\/news\/(articles|videos)\/\w+$")
+
+class Tools:
+	def __init__(self):
+		"""Create the tool container; there is no instance state to set up."""
+		return None
+	# Empty pydantic model: no settings fields are declared for this tool.
+	# NOTE(review): presumably the per-user "valves" hook that open-webui looks for - confirm.
+	class UserValves(BaseModel): pass
+
+
+	async def get_bbc_news_feed(
+		self,
+		type: ArticleType,
+		__event_emitter__: Callable[[dict], Awaitable[None]],
+		__user__: dict = {},
+	) -> str:
+		"""
+		Get the latest news from the BBC, as an array of JSON objects with a title, description, link, and published date.
+		:param type: The type of news to get. It can be any of the ArticleType enum values (world, uk, business, politics, health, education, science_and_environment, technology, entertainment_and_arts, england, northern_ireland, scotland, wales, world/africa, world/asia, world/australia, world/europe, world/latin_america, world/middle_east, world/us_and_canada).
+		:return: A list of news items or an error message.
+		"""
+		# The docstring above is kept word-for-word: per the changelog it is written to guide the LLM's query.
+		# Flow: resolve the category, download its RSS feed, return the <item> entries as a JSON string.
+		# Every exit path emits a final status event with "done": True.
+
+		# Resolve the category BEFORE using get_name()/get_uri(): the argument can arrive as a plain
+		# string instead of an ArticleType (see changelog 0.0.7), and a string has neither method.
+		# Enum values are tried first, then member names (e.g. "africa", "top_stories").
+		if not isinstance(type, ArticleType):
+			try:
+				type = ArticleType(type)
+			except ValueError:
+				by_name = ArticleType.__members__.get(str(type))
+				if by_name is None:
+					await __event_emitter__({ "data": { "description": f"Error: '{type}' is not a valid BBC News category.", "status": "complete", "done": True }, "type": "status" })
+					return f"Error: '{type}' is not a valid BBC News category."
+				type = by_name
+
+		await __event_emitter__({ "data": { "description": f"Starting BBC News Feed retrieval for articles in the '{type.get_name()}' category...", "status": "in_progress", "done": False }, "type": "status" })
+		output = []
+		try:
+			# A timeout stops a stalled connection from hanging the tool call indefinitely.
+			response = requests.get(type.get_uri(), timeout=30)
+			if not response.ok:
+				# Close the status out so the caller is not left with an unfinished "in_progress" status.
+				await __event_emitter__({ "data": { "description": f"Failed to retrieved any news items from BBC News Feed for articles in the '{type.get_name()}' ({type.get_uri()}) category: HTTP {response.status_code}.", "status": "complete", "done": True }, "type": "status" })
+				return f"Error: '{type}' ({type.get_uri()}) not found ({response.status_code})"
+			root = ElementTree.fromstring(response.content)
+			# findtext() returns None for a missing child, so one incomplete <item> no longer
+			# raises and discards the whole feed.
+			for item in root.iter("item"):
+				output.append({
+					"title": item.findtext("title"),
+					"description": item.findtext("description"),
+					"link": item.findtext("link"),
+					"published": item.findtext("pubDate"),
+				})
+			await __event_emitter__({ "data": { "description": f"Retrieved {len(output)} news items from BBC News Feed for articles in the '{type.get_name()}' category.", "status": "complete", "done": True }, "type": "status" })
+		except Exception as e:
+			# Tool boundary: network and XML parsing failures are reported as text, never raised.
+			await __event_emitter__({ "data": { "description": f"Failed to retrieved any news items from BBC News Feed for articles in the '{type.get_name()}' ({type.get_uri()}) category: {e}.", "status": "complete", "done": True }, "type": "status" })
+			return f"Error: {e}"
+
+		return json.dumps(output)
+		
+
+	async def get_bbc_news_content(
+		self,
+		uri: str,
+		__event_emitter__: Callable[[dict], Awaitable[None]],
+		__user__: dict = {},
+	) -> str:
+		"""
+		Get the content of a news article from the BBC.
+		:param uri: The URI of the article to get the content of, which should start with https://bbc.com/news or https://bbc.co.uk/news.
+		:return: The content of the article or an error message.
+		"""
+		# The docstring above is kept word-for-word, in case its wording is consumed as the tool description.
+		# Flow: validate the URI, download the page, return the text of every <p> inside <article>,
+		# one paragraph per line. Every exit path emits a final status event with "done": True.
+
+		# Tolerate None and stray surrounding whitespace before validating.
+		uri = (uri or "").strip()
+		await __event_emitter__({ "data": { "description": f"Starting BBC News Article retrieval from '{uri}'...", "status": "in_progress", "done": False }, "type": "status" })
+
+		if not uri:
+			await __event_emitter__({ "data": { "description": "Error: No URI provided.", "status": "complete", "done": True }, "type": "status" })
+			return "Error: No URI provided"
+
+		# fullmatch() rather than match(): "$" alone would also accept a trailing newline.
+		if URI_REGEX.fullmatch(uri) is None:
+			await __event_emitter__({ "data": { "description": "Error: URI must be a BBC News article.", "status": "complete", "done": True }, "type": "status" })
+			return "Error: URI must be a BBC News article."
+
+		try:
+			# A timeout stops a stalled connection from hanging the tool call indefinitely.
+			response = requests.get(uri, timeout=30)
+			if not response.ok:
+				# Close the status out so the caller is not left with an unfinished "in_progress" status.
+				await __event_emitter__({ "data": { "description": f"Failed to retrieve BBC News Article content from '{uri}': HTTP {response.status_code}.", "status": "complete", "done": True }, "type": "status" })
+				return f"Error: '{uri}' not found ({response.status_code})"
+			article = BeautifulSoup(response.content, "html.parser").find("article")
+			if article is None:
+				await __event_emitter__({ "data": { "description": f"Failed to retrieve BBC News Article content from '{uri}': Article content not found.", "status": "complete", "done": True }, "type": "status" })
+				return f"Error: Article content for {uri} not found."
+
+			# Build the text in one pass; each paragraph keeps its trailing newline.
+			content = "".join(f"{paragraph.text}\n" for paragraph in article.find_all("p"))
+			await __event_emitter__({ "data": { "description": f"Retrieved BBC News Article content from '{uri}' ({len(content)} characters).", "status": "complete", "done": True }, "type": "status" })
+		except Exception as e:
+			# Tool boundary: network and parsing failures are reported as text, never raised.
+			await __event_emitter__({ "data": { "description": f"Failed to retrieve BBC News Article content from '{uri}': {e}.", "status": "complete", "done": True }, "type": "status" })
+			return f"Error: {e}"
+
+		return content
+
--- a/test/bbc_news.test.py
+++ b/test/bbc_news.test.py
@ -0,0 +1,12 @@
+# First, we need to add the src to the path so we can import the class
+import sys, os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../src")))
+# Unfortunately, we don't get any type hinting, but this does work :)
+from bbc_news import Tools, ArticleType
+
+async def main():
+	"""Request the top-stories feed once, echoing every status event to stdout."""
+	async def mock_event_emitter(event: dict):
+		# Stand-in for a real event emitter: just print each event's data payload.
+		print("Event Emitted:", event["data"])
+
+	tools = Tools()
+	await tools.get_bbc_news_feed(ArticleType.top_stories, mock_event_emitter)
+
+# asyncio is only used here, to drive the main() coroutine when this file is run as a script.
+import asyncio
+if __name__ == "__main__": asyncio.run(main())