Added a base RedditPage class which can extract data from a Reddit .json link, and returns a RedditPageData object
This commit is contained in:
parent
6072d70aea
commit
8ccd04fca2
@ -33,48 +33,44 @@ class RedditPageData:
|
|||||||
def __str__(self): return json.dumps({ "uri": self.uri, "posts": self.posts, "comments": self.comments, "after": self.after })
|
def __str__(self): return json.dumps({ "uri": self.uri, "posts": self.posts, "comments": self.comments, "after": self.after })
|
||||||
|
|
||||||
|
|
||||||
"score": item["score"],
|
class RedditPage:
|
||||||
"upvotes": item["ups"],
|
def __init__(self, id: str, page_type: RedditPageType, after: str | None = None):
|
||||||
"downvotes": item["downs"],
|
self.base_uri = "https://old.reddit.com"
|
||||||
"upvote_ratio": item["upvote_ratio"],
|
self.id = id
|
||||||
"total_comments": item["num_comments"],
|
self.page_type = page_type
|
||||||
"total_crossposts": item["num_crossposts"],
|
self.after = ""
|
||||||
"total_awards": item["total_awards_received"],
|
self.children = []
|
||||||
"domain": item["domain"],
|
self.posts = []
|
||||||
"flair_text": item["link_flair_text"],
|
self.comments = []
|
||||||
"media_embed": item["media_embed"],
|
self.after = after
|
||||||
|
|
||||||
"is_pinned": item["pinned"],
|
def __str__(self): return json.dumps(RedditPageData(uri=self.get_uri(), posts=self.posts, comments=self.comments, after=self.after))
|
||||||
"is_self": item["is_self"],
|
def get_uri(self):
|
||||||
"is_video": item["is_video"],
|
uri = f"{self.base_uri}/{self.page_type.value}/{self.id}.json"
|
||||||
"is_media_only": item["media_only"],
|
if self.after: uri += f"?after={self.after}"
|
||||||
"is_over_18": item["over_18"],
|
return uri
|
||||||
"is_edited": item["edited"],
|
def get_data(self): return { "posts": self.posts, "comments": self.comments, "after": self.after }
|
||||||
"is_hidden": item["hidden"],
|
def get_page(self):
|
||||||
"is_archived": item["archived"],
|
response = requests.get(self.get_uri())
|
||||||
"is_locked": item["locked"],
|
if not response.ok: return RedditPageData({ "posts": [], "comments": [], "after": "" })
|
||||||
"is_quarantined": item["quarantine"],
|
raw_data = json.loads(response.content)
|
||||||
"is_spoiler": item["spoiler"],
|
is_comments = self.page_type.name.endswith("_COMMENTS")
|
||||||
"is_stickied": item["stickied"],
|
|
||||||
"is_send_replies": item["send_replies"],
|
|
||||||
|
|
||||||
"published_at": item["created_utc"],
|
if is_comments:
|
||||||
})
|
for i in range(0, 1): self.extract_children(raw_data[i])
|
||||||
return posts
|
self.after = None
|
||||||
|
else:
|
||||||
|
self.extract_children(raw_data)
|
||||||
|
try: self.after = raw_data["data"]["after"]
|
||||||
|
except: None
|
||||||
|
|
||||||
|
return RedditPageData(posts=self.posts, comments=self.comments, after=self.after)
|
||||||
|
|
||||||
|
|
||||||
def parse_comments(data: list):
|
def extract_children(self, data):
|
||||||
comments = []
|
if "data" in data and "children" in data["data"]:
|
||||||
for item in data:
|
for item in data["data"]["children"]: self.children.append(item)
|
||||||
if item["kind"] != "t1": continue # skip non-comment items
|
|
||||||
item = item["data"]
|
|
||||||
comments.append({
|
|
||||||
"id": item["name"],
|
|
||||||
"body": item["body"],
|
|
||||||
"link": item["permalink"],
|
|
||||||
"post_id": item["link_id"],
|
|
||||||
"post_title": item["link_title"],
|
|
||||||
"post_link": item["link_permalink"],
|
|
||||||
|
|
||||||
"author_username": item["author"],
|
"author_username": item["author"],
|
||||||
"author_id": item["author_fullname"],
|
"author_id": item["author_fullname"],
|
||||||
|
Loading…
Reference in New Issue
Block a user