class RedditPage:
    """A Reddit page (subreddit listing, user page, or comment thread)
    fetched via old.reddit.com's ``.json`` API.

    Raw listing children are accumulated in ``self.children``; results are
    exposed as a ``RedditPageData``.
    """

    def __init__(self, id: str, page_type: RedditPageType, after: str | None = None):
        """
        Args:
            id: Subreddit/user/post identifier appended to the page-type path.
            page_type: Kind of Reddit page; its ``.value`` is the URL path
                prefix (e.g. a subreddit vs. user path).
            after: Optional pagination cursor from a previous fetch.
        """
        self.base_uri = "https://old.reddit.com"
        self.id = id
        self.page_type = page_type
        self.children = []
        self.posts = []
        self.comments = []
        # Single assignment; the original set "" and immediately overwrote it.
        self.after = after

    def __str__(self):
        # Delegate to RedditPageData.__str__, which already emits JSON.
        # The original wrapped it in json.dumps(), which raises TypeError
        # because RedditPageData is not JSON-serialisable.
        return str(RedditPageData(uri=self.get_uri(), posts=self.posts,
                                  comments=self.comments, after=self.after))

    def get_uri(self):
        """Build the ``.json`` API URI, appending the pagination cursor if set."""
        uri = f"{self.base_uri}/{self.page_type.value}/{self.id}.json"
        if self.after:
            uri += f"?after={self.after}"
        return uri

    def get_data(self):
        """Return the collected results as a plain dict."""
        return {"posts": self.posts, "comments": self.comments, "after": self.after}

    def get_page(self):
        """Fetch the page and accumulate its listing children.

        Returns:
            RedditPageData with the collected posts/comments and the
            pagination cursor; an empty RedditPageData on HTTP failure.
        """
        response = requests.get(self.get_uri())
        if not response.ok:
            # Use keyword arguments: the original passed a dict positionally,
            # which would have bound it to RedditPageData's first parameter.
            return RedditPageData(posts=[], comments=[], after="")
        raw_data = json.loads(response.content)
        is_comments = self.page_type.name.endswith("_COMMENTS")
        if is_comments:
            # A comment-page response is a list of listings; walk all of
            # them — the original's `for i in range(0, 1)` visited only the
            # first and skipped the comments listing entirely.
            for listing in raw_data:
                self.extract_children(listing)
            # Comment threads do not paginate with an "after" cursor.
            self.after = None
        else:
            self.extract_children(raw_data)
            # Missing keys simply leave the cursor as None instead of the
            # original's bare `except: None`, which swallowed every error.
            self.after = raw_data.get("data", {}).get("after")
        # NOTE(review): self.posts / self.comments are never filled from
        # self.children here — confirm whether a parsing step is missing.
        return RedditPageData(posts=self.posts, comments=self.comments, after=self.after)

    def extract_children(self, data):
        """Append every child item of a listing payload to ``self.children``."""
        if "data" in data and "children" in data["data"]:
            self.children.extend(data["data"]["children"])