From 1198424f4fe53c4eab71b46610cc7b09317367cd Mon Sep 17 00:00:00 2001 From: Nathan Windisch Date: Mon, 22 Jul 2024 02:20:35 +0100 Subject: [PATCH] Initial commit. --- .gitignore | 1 + README.md | 3 + src/bbc_news.py | 148 ++++++++++++++++++++++++++++++++++++++++++ test/bbc_news.test.py | 12 ++++ 4 files changed, 164 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 src/bbc_news.py create mode 100644 test/bbc_news.test.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ba0430d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__/ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..5cbbcb7 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# open-webui tool: bbc news feeds + +A tool to interact with bbc.co.uk/news feeds, including top stories, world news, UK/regional news, and more. \ No newline at end of file diff --git a/src/bbc_news.py b/src/bbc_news.py new file mode 100644 index 0000000..3e3d5cd --- /dev/null +++ b/src/bbc_news.py @@ -0,0 +1,148 @@ +""" +title: BBC News Feeds +author: @nathanwindisch +author_url: https://github.com/nathanwindisch +funding_url: https://www.patreon.com/NathanWindisch +version: 0.0.9 +changelog: +- 0.0.1 - Initial upload to openwebui community. +- 0.0.2 - Modified formatting slightly. +- 0.0.3 - Added tool docstring, and this changelog. +- 0.0.4 - Added funding_url to docstring. +- 0.0.5 - Updated get_bbc_news_feed function to use a default for + the ArticleType, and updated its docstring to include + a list of the possible types, to assist the LLM's query. +- 0.0.6 - Added event emitter to the get_bbc_news_feed function, + to provide status updates to the user as the function + executes. Also wrapped the function in a try/except, to + handle any exceptions that may occur during execution. 
class ArticleType(Enum):
    """BBC News RSS feed categories.

    Each member's value is the path segment inserted after
    ``https://feeds.bbci.co.uk/news/`` to build the feed address.
    ``top_stories`` has an empty value because its feed sits directly
    under ``/news/``.

    Besides exact values (``ArticleType("world/africa")``), lookup also
    accepts member names and ignores case, surrounding whitespace, and
    space/hyphen-vs-underscore differences (``ArticleType("Africa")``,
    ``ArticleType("US and Canada")``). Unknown inputs still raise
    ``ValueError``.
    """

    top_stories = ""
    world = "world"
    uk = "uk"
    business = "business"
    politics = "politics"
    health = "health"
    education = "education"
    science_and_environment = "science_and_environment"
    technology = "technology"
    entertainment_and_arts = "entertainment_and_arts"
    england = "england"
    northern_ireland = "northern_ireland"
    scotland = "scotland"
    wales = "wales"
    africa = "world/africa"
    asia = "world/asia"
    australia = "world/australia"
    europe = "world/europe"
    latin_america = "world/latin_america"
    middle_east = "world/middle_east"
    us_and_canada = "world/us_and_canada"

    @classmethod
    def _missing_(cls, value):
        """Resolve a lookup that did not match any member value exactly.

        Called by ``Enum`` only after the exact-value lookup has failed.

        :param value: The object passed to ``ArticleType(...)``.
        :return: The matching member, or ``None`` so that ``Enum`` raises
            its usual ``ValueError``.
        """
        if not isinstance(value, str):
            return None
        # Normalise the free-form spellings a caller is likely to send.
        key = value.strip().lower().replace(" ", "_").replace("-", "_")
        for member in cls:
            if member.value == key:
                return member
        # Fall back to the member name, e.g. "africa" or "top_stories".
        return cls.__members__.get(key)

    def get_name(self) -> str:
        """Return a human-readable name, e.g. ``"Us And Canada"``."""
        return self.name.replace("_", " ").title()

    def get_uri(self) -> str:
        """Return the RSS feed URI for this category."""
        # top_stories has an empty value; interpolating it would produce a
        # double slash, so its feed address is spelled out explicitly.
        if self.name == "top_stories":
            return "https://feeds.bbci.co.uk/news/rss.xml"
        return f"https://feeds.bbci.co.uk/news/{self.value}/rss.xml"
# Matches a BBC News article or video URI:
#   - the scheme is http or https;
#   - the host is bbc.com or bbc.co.uk, optionally prefixed with "www.";
#   - the path is /news/articles/<id> or /news/videos/<id>, where <id> is one
#     or more word characters.
# Written as a raw string so the regex escapes are not parsed as (invalid)
# string escape sequences.
URI_REGEX = re.compile(r"^(https?://)(www\.)?bbc\.(com|co\.uk)/news/(articles|videos)/\w+$")

# Upper bound, in seconds, on each HTTP request so a stalled connection cannot
# hang the tool call indefinitely.
_REQUEST_TIMEOUT_SECONDS = 10


# Kept at module level rather than as a method of Tools so that it is not
# picked up as a callable tool on the Tools class.
async def _emit_status(
    emitter: Callable[[dict], Awaitable[None]],
    description: str,
    *,
    done: bool,
) -> None:
    """Send one status event through the event emitter.

    :param emitter: The ``__event_emitter__`` callable handed to the tool.
    :param description: The text shown to the user.
    :param done: ``False`` for the in-progress event, ``True`` for the final one.
    """
    await emitter({
        "data": {
            "description": description,
            "status": "complete" if done else "in_progress",
            "done": done,
        },
        "type": "status",
    })


class Tools:
    """Tools for reading BBC News RSS feeds and article text."""

    def __init__(self):
        pass

    class UserValves(BaseModel):
        pass

    async def get_bbc_news_feed(
        self,
        type: ArticleType,
        __event_emitter__: Callable[[dict], Awaitable[None]],
        __user__: dict = {},
    ) -> str:
        """
        Get the latest news from the BBC, as an array of JSON objects with a title, description, link, and published date.
        :param type: The type of news to get. It can be any of the ArticleType enum values (world, uk, business, politics, health, education, science_and_environment, technology, entertainment_and_arts, england, northern_ireland, scotland, wales, world/africa, world/asia, world/australia, world/europe, world/latin_america, world/middle_east, world/us_and_canada).
        :return: A list of news items or an error message.
        """
        # The argument can arrive as a plain string, so it is coerced to the
        # enum before any enum method is used, and an unknown category is
        # reported instead of raised.
        try:
            type = ArticleType(type)
        except ValueError:
            message = f"Error: '{type}' is not a valid BBC News category."
            await _emit_status(__event_emitter__, message, done=True)
            return message

        category = type.get_name()
        feed_uri = type.get_uri()
        await _emit_status(
            __event_emitter__,
            f"Starting BBC News Feed retrieval for articles in the '{category}' category...",
            done=False,
        )

        try:
            response = requests.get(feed_uri, timeout=_REQUEST_TIMEOUT_SECONDS)
            if not response.ok:
                message = f"Error: '{type}' ({feed_uri}) not found ({response.status_code})"
                # Close the status out so it is not left marked in progress.
                await _emit_status(__event_emitter__, message, done=True)
                return message
            root = ElementTree.fromstring(response.content)
            # findtext() yields None for a missing child instead of raising,
            # so one incomplete <item> does not discard the whole feed.
            output = [
                {
                    "title": item.findtext("title"),
                    "description": item.findtext("description"),
                    "link": item.findtext("link"),
                    "published": item.findtext("pubDate"),
                }
                for item in root.iter("item")
            ]
        except Exception as e:
            # Tool boundary: any failure is reported as text, never raised.
            await _emit_status(
                __event_emitter__,
                f"Failed to retrieved any news items from BBC News Feed for articles in the '{category}' ({feed_uri}) category: {e}.",
                done=True,
            )
            return f"Error: {e}"

        await _emit_status(
            __event_emitter__,
            f"Retrieved {len(output)} news items from BBC News Feed for articles in the '{category}' category.",
            done=True,
        )
        return json.dumps(output)

    async def get_bbc_news_content(
        self,
        uri: str,
        __event_emitter__: Callable[[dict], Awaitable[None]],
        __user__: dict = {},
    ) -> str:
        """
        Get the content of a news article from the BBC.
        :param uri: The URI of the article to get the content of, which should start with https://bbc.com/news or https://bbc.co.uk/news.
        :return: The content of the article or an error message.
        """
        # Tolerate None and stray surrounding whitespace in the argument.
        uri = (uri or "").strip()
        await _emit_status(
            __event_emitter__,
            f"Starting BBC News Article retrieval from '{uri}'...",
            done=False,
        )

        if not uri:
            await _emit_status(__event_emitter__, "Error: No URI provided.", done=True)
            return "Error: No URI provided"

        # fullmatch() requires the whole string to match; match() with "$"
        # would also accept a trailing newline.
        if not URI_REGEX.fullmatch(uri):
            await _emit_status(__event_emitter__, "Error: URI must be a BBC News article.", done=True)
            return "Error: URI must be a BBC News article."

        try:
            response = requests.get(uri, timeout=_REQUEST_TIMEOUT_SECONDS)
            if not response.ok:
                message = f"Error: '{uri}' not found ({response.status_code})"
                # Close the status out so it is not left marked in progress.
                await _emit_status(__event_emitter__, message, done=True)
                return message
            article = BeautifulSoup(response.content, "html.parser").find("article")
            if article is None:
                await _emit_status(
                    __event_emitter__,
                    f"Failed to retrieve BBC News Article content from '{uri}': Article content not found.",
                    done=True,
                )
                return f"Error: Article content for {uri} not found."
            # One paragraph per line, each terminated by a newline.
            content = "".join(f"{paragraph.text}\n" for paragraph in article.find_all("p"))
        except Exception as e:
            # Tool boundary: any failure is reported as text, never raised.
            await _emit_status(
                __event_emitter__,
                f"Failed to retrieve BBC News Article content from '{uri}': {e}.",
                done=True,
            )
            return f"Error: {e}"

        await _emit_status(
            __event_emitter__,
            f"Retrieved BBC News Article content from '{uri}' ({len(content)} characters).",
            done=True,
        )
        return content