Properly escape post content that contains HTML tags; fix issue with scraping when metadata is in an unanticipated form

This commit is contained in:
John Stephani 2026-01-30 19:42:40 -06:00
parent 192d4e739e
commit b7d80002a1
3 changed files with 280 additions and 274 deletions
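
The escaping half of this change leans on Python's standard-library html.escape, which rewrites &, <, and > (and, by default, quotes) as HTML entities so that tags embedded in a post body show up as literal text instead of being rendered. A minimal sketch with a made-up post body:

import html

# Made-up post body containing markup that should display as text, not render as HTML.
body = "Use <br> for line breaks & <b>bold</b> for emphasis"

print(html.escape(body))
# Use &lt;br&gt; for line breaks &amp; &lt;b&gt;bold&lt;/b&gt; for emphasis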

View File

@@ -9,312 +9,316 @@ from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

logger = logging.basicConfig(
    filename="YARS.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)


class YARS:
    __slots__ = ("headers", "session", "proxy", "timeout")

    def __init__(self, proxy=None, timeout=10, random_user_agent=True):
        self.session = RandomUserAgentSession() if random_user_agent else requests.Session()
        self.proxy = proxy
        self.timeout = timeout
        retries = Retry(
            total=5,
            backoff_factor=2,  # Exponential backoff
            status_forcelist=[429, 500, 502, 503, 504],
        )
        self.session.mount("https://", HTTPAdapter(max_retries=retries))
        if proxy:
            self.session.proxies.update({"http": proxy, "https": proxy})

    def handle_search(self, url, params, after=None, before=None):
        if after:
            params["after"] = after
        if before:
            params["before"] = before
        try:
            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            logging.info("Search request successful")
        except Exception as e:
            if response.status_code != 200:
                logging.info("Search request unsuccessful due to: %s", e)
                print(f"Failed to fetch search results: {response.status_code}")
                return []
        data = response.json()
        results = []
        for post in data["data"]["children"]:
            post_data = post["data"]
            results.append(
                {
                    "title": post_data["title"],
                    "link": f"https://www.reddit.com{post_data['permalink']}",
                    "description": post_data.get("selftext", "")[:269],
                }
            )
logging.info("Search Results Retrned %d Results", len(results)) logging.info("Search Results Retrned %d Results", len(results))
return results return results
    def search_reddit(self, query, limit=10, after=None, before=None):
        url = "https://www.reddit.com/search.json"
        params = {"q": query, "limit": limit, "sort": "relevance", "type": "link"}
        return self.handle_search(url, params, after, before)

    def search_subreddit(self, subreddit, query, limit=10, after=None, before=None, sort="relevance"):
        url = f"https://www.reddit.com/r/{subreddit}/search.json"
        params = {"q": query, "limit": limit, "sort": "relevance", "type": "link", "restrict_sr": "on"}
        return self.handle_search(url, params, after, before)

    def scrape_post_details(self, permalink):
        url = f"https://www.reddit.com{permalink}.json"
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            logging.info("Post details request successful: %s", url)
        except Exception as e:
logging.info("Post details request unsccessful: %e", e) logging.info("Post details request unsccessful: %e", e)
if response.status_code != 200: if response.status_code != 200:
print(f"Failed to fetch post data: {response.status_code}") print(f"Failed to fetch post data: {response.status_code}")
return None return None
post_data = response.json() post_data = response.json()
if not isinstance(post_data, list) or len(post_data) < 2: if not isinstance(post_data, list) or len(post_data) < 2:
logging.info("Unexpected post data structre") logging.info("Unexpected post data structre")
print("Unexpected post data structure") print("Unexpected post data structure")
return None return None
        main_post = post_data[0]["data"]["children"][0]["data"]
        title = main_post["title"]
        body = main_post.get("selftext", "")
        comments = self._extract_comments(post_data[1]["data"]["children"])
        logging.info("Successfully scraped post: %s", title)
        return {"title": title, "body": body, "comments": comments}

    def _extract_comments(self, comments):
        logging.info("Extracting comments")
        extracted_comments = []
        for comment in comments:
            if isinstance(comment, dict) and comment.get("kind") == "t1":
                comment_data = comment.get("data", {})
                extracted_comment = {
                    "author": comment_data.get("author", ""),
                    "body": comment_data.get("body", ""),
                    "score": comment_data.get("score", ""),
                    "replies": [],
                }
                replies = comment_data.get("replies", "")
                if isinstance(replies, dict):
                    extracted_comment["replies"] = self._extract_comments(
                        replies.get("data", {}).get("children", [])
                    )
                extracted_comments.append(extracted_comment)
        logging.info("Successfully extracted comments")
        return extracted_comments

    def scrape_user_data(self, username, limit=10):
        logging.info("Scraping user data for %s, limit: %d", username, limit)
        base_url = f"https://www.reddit.com/user/{username}/.json"
        params = {"limit": limit, "after": None}
        all_items = []
        count = 0
        while count < limit:
            try:
                response = self.session.get(
                    base_url, params=params, timeout=self.timeout
                )
                response.raise_for_status()
                logging.info("User data request successful")
            except Exception as e:
                logging.info("User data request unsuccessful: %s", e)
                if response.status_code != 200:
                    print(
                        f"Failed to fetch data for user {username}: {response.status_code}"
                    )
                    break
            try:
                data = response.json()
            except ValueError:
                print(f"Failed to parse JSON response for user {username}.")
                break
            if "data" not in data or "children" not in data["data"]:
                print(
                    f"No 'data' or 'children' field found in response for user {username}."
                )
                logging.info("No 'data' or 'children' field found in response")
                break
            items = data["data"]["children"]
            if not items:
                print(f"No more items found for user {username}.")
                logging.info("No more items found for user")
                break
            for item in items:
                kind = item["kind"]
                item_data = item["data"]
                if kind == "t3":
                    post_url = f"https://www.reddit.com{item_data.get('permalink', '')}"
                    all_items.append(
                        {
                            "type": "post",
                            "title": item_data.get("title", ""),
                            "subreddit": item_data.get("subreddit", ""),
                            "url": post_url,
                            "created_utc": item_data.get("created_utc", ""),
                        }
                    )
                elif kind == "t1":
                    comment_url = (
                        f"https://www.reddit.com{item_data.get('permalink', '')}"
                    )
                    all_items.append(
                        {
                            "type": "comment",
                            "subreddit": item_data.get("subreddit", ""),
                            "body": item_data.get("body", ""),
                            "created_utc": item_data.get("created_utc", ""),
                            "url": comment_url,
                        }
                    )
                count += 1
                if count >= limit:
                    break
            params["after"] = data["data"].get("after")
            if not params["after"]:
                break
            time.sleep(random.uniform(1, 2))
            logging.info("Sleeping for random time")
        logging.info("Successfully scraped user data for %s", username)
        return all_items

    def fetch_subreddit_posts(
        self, subreddit, limit=10, category="hot", time_filter="all"
    ):
        logging.info(
            "Fetching subreddit/user posts for %s, limit: %d, category: %s, time_filter: %s",
            subreddit,
            limit,
            category,
            time_filter,
        )
        if category not in ["hot", "top", "new", "userhot", "usertop", "usernew"]:
            raise ValueError("Category must be 'hot', 'top', or 'new' for a subreddit, or 'userhot', 'usertop', or 'usernew' for a user")
        batch_size = min(100, limit)
        total_fetched = 0
        after = None
        all_posts = []
        while total_fetched < limit:
            if category == "hot":
                url = f"https://www.reddit.com/r/{subreddit}/hot.json"
            elif category == "top":
                url = f"https://www.reddit.com/r/{subreddit}/top.json"
            elif category == "new":
                url = f"https://www.reddit.com/r/{subreddit}/new.json"
            elif category == "userhot":
                url = f"https://www.reddit.com/user/{subreddit}/submitted/hot.json"
            elif category == "usertop":
                url = f"https://www.reddit.com/user/{subreddit}/submitted/top.json"
            else:
                url = f"https://www.reddit.com/user/{subreddit}/submitted/new.json"
            params = {
                "limit": batch_size,
                "after": after,
                "raw_json": 1,
                "t": time_filter,
            }
            try:
                response = self.session.get(url, params=params, timeout=self.timeout)
                response.raise_for_status()
                logging.info("Subreddit/user posts request successful")
            except Exception as e:
                logging.info("Subreddit/user posts request unsuccessful: %s", e)
                if response.status_code != 200:
                    print(
                        f"Failed to fetch posts for subreddit/user {subreddit}: {response.status_code}"
                    )
                    break
            data = response.json()
            posts = data["data"]["children"]
            if not posts:
                break
            for post in posts:
                post_data = post["data"]
                post_info = {
                    "title": post_data["title"],
                    "subreddit": post_data["subreddit"],
                    "author": post_data["author"],
                    "permalink": post_data["permalink"],
                    "score": post_data["score"],
                    "num_comments": post_data["num_comments"],
                    "created_utc": post_data["created_utc"],
                }
                if "selftext" in post_data:
                    body = post_data["selftext"]
                    if body is not None and len(body) > 0:
                        post_info["body"] = body
                if "gallery_data" in post_data and "media_metadata" in post_data:
                    items = post_data["gallery_data"]["items"]
                    media_urls = []
                    for item in items:
                        id = item["media_id"]
                        if id in post_data["media_metadata"]:
                            metadata = post_data["media_metadata"][id]
                            if "p" not in metadata:
                                continue
                            m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"])
                            if m:
                                media_urls.append(f"https://i.redd.it/{m.group(1)}")
                    post_info["media_urls"] = media_urls
                elif "media_metadata" in post_data:
                    media_urls = []
                    for id in post_data["media_metadata"]:
                        metadata = post_data["media_metadata"][id]
                        if "p" not in metadata:
                            continue
                        m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"])
                        if m:
                            media_urls.append(f"https://i.redd.it/{m.group(1)}")
                    post_info["media_urls"] = media_urls
                elif "media" in post_data and post_data["media"] is not None and "reddit_video" in post_data["media"]:
                    media_url = post_data["media"]["reddit_video"]["fallback_url"]
                    video_url = media_url[:media_url.find('?')]
                    audio_url = video_url[:video_url.rfind('/')] + "/CMAF_AUDIO_128.mp4"
                    post_info["media_urls"] = [video_url, audio_url]
                elif "url" in post_data:
                    url = post_data["url"]
                    if re.fullmatch(r"https:\/\/i\.redd\.it\/.{1,20}", url):
                        post_info["media_urls"] = [url]
                    elif "body" not in post_info:
                        post_info["body"] = url
                if "thumbnail" in post_data and post_data["thumbnail"] != "self":
                    post_info["thumbnail_url"] = post_data["thumbnail"]
                all_posts.append(post_info)
                total_fetched += 1
                if total_fetched >= limit:
                    break
            after = data["data"].get("after")
            if not after:
                break
            time.sleep(random.uniform(1, 2))
            logging.info("Sleeping for random time")
        logging.info("Successfully fetched subreddit posts for %s", subreddit)
        return all_posts
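
The media_metadata change above skips entries that lack a "p" key (what appears to be the preview list) instead of raising a KeyError when Reddit returns metadata in an unanticipated form. Here is the same guard pattern in isolation; the metadata dict, IDs, and URLs are invented for illustration:

import re

# Made-up media_metadata in two shapes: one with a "p" preview list, one without.
media_metadata = {
    "abc123": {"status": "valid", "p": [{"u": "https://preview.redd.it/abc123.jpg?width=108&crop=smart"}]},
    "def456": {"status": "failed"},  # unanticipated form: no "p" key
}

media_urls = []
for media_id, metadata in media_metadata.items():
    if "p" not in metadata:
        continue  # skip entries without preview data instead of crashing
    m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"])
    if m:
        media_urls.append(f"https://i.redd.it/{m.group(1)}")

print(media_urls)  # ['https://i.redd.it/abc123.jpg']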

View File

@@ -4,6 +4,7 @@ from urllib.parse import urlparse
import delete_posts
import config
import json
import html
import re
import sqlite3
import subprocess
@@ -415,6 +416,7 @@ def add_age_to_posts(posts):
def reformat_body(posts):
    for post in posts:
        if "body" in post and post["body"] is not None:
            post["body"] = html.escape(post["body"])
            post["body"] = post["body"].rstrip().replace("\n", "<br>")
            post["body"] = re.sub(r"\[(.*?)\]\((.*?)\)", r'<b><a href="\2" style="white-space: nowrap;" class="no-style-link">\1</a></b>', post["body"])

View File

@@ -36,7 +36,7 @@ def run():
    print("Deleting old media db rows")
    delete = "DELETE FROM media WHERE permalink NOT IN (SELECT permalink FROM post)"
    cursor.execute(delete)
    print("Deleting media db for read posts")
    delete = "DELETE FROM media WHERE permalink IN (SELECT permalink FROM post WHERE hidden = ?)"
    binds = [True]
    cursor.execute(delete, binds)