Properly escape post content that contains HTML tags; fix issue with scraping when metadata is in an unanticipated form

This commit is contained in:
John Stephani 2026-01-30 19:42:40 -06:00
parent 192d4e739e
commit b7d80002a1
3 changed files with 280 additions and 274 deletions

View File

@@ -278,6 +278,8 @@ class YARS:
id = item["media_id"] id = item["media_id"]
if id in post_data["media_metadata"]: if id in post_data["media_metadata"]:
metadata = post_data["media_metadata"][id] metadata = post_data["media_metadata"][id]
if "p" not in metadata:
continue
m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"]) m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"])
if m: if m:
media_urls.append(f"https://i.redd.it/{m.group(1)}") media_urls.append(f"https://i.redd.it/{m.group(1)}")
@@ -286,6 +288,8 @@ class YARS:
media_urls = [] media_urls = []
for id in post_data["media_metadata"]: for id in post_data["media_metadata"]:
metadata = post_data["media_metadata"][id] metadata = post_data["media_metadata"][id]
if "p" not in metadata:
continue
m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"]) m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"])
if m: if m:
media_urls.append(f"https://i.redd.it/{m.group(1)}") media_urls.append(f"https://i.redd.it/{m.group(1)}")

View File

@@ -4,6 +4,7 @@ from urllib.parse import urlparse
import delete_posts import delete_posts
import config import config
import json import json
import html
import re import re
import sqlite3 import sqlite3
import subprocess import subprocess
@@ -415,6 +416,7 @@ def add_age_to_posts(posts):
def reformat_body(posts): def reformat_body(posts):
for post in posts: for post in posts:
if "body" in post and post["body"] is not None: if "body" in post and post["body"] is not None:
post["body"] = html.escape(post["body"])
post["body"] = post["body"].rstrip().replace("\n", "<br>") post["body"] = post["body"].rstrip().replace("\n", "<br>")
post["body"] = re.sub(r"\[(.*?)\]\((.*?)\)", r'<b><a href="\2" style="white-space: nowrap;" class="no-style-link">\1</a></b>', post["body"]) post["body"] = re.sub(r"\[(.*?)\]\((.*?)\)", r'<b><a href="\2" style="white-space: nowrap;" class="no-style-link">\1</a></b>', post["body"])

View File

@@ -36,7 +36,7 @@ def run():
print("Deleting old media db rows") print("Deleting old media db rows")
delete = "DELETE FROM media WHERE permalink NOT IN (SELECT permalink FROM post)" delete = "DELETE FROM media WHERE permalink NOT IN (SELECT permalink FROM post)"
cursor.execute(delete) cursor.execute(delete)
print("Deleving media db for read posts") print("Deleting media db for read posts")
delete = "DELETE FROM media WHERE permalink IN (SELECT permalink FROM post WHERE hidden = ?)" delete = "DELETE FROM media WHERE permalink IN (SELECT permalink FROM post WHERE hidden = ?)"
binds = [True] binds = [True]
cursor.execute(delete, binds) cursor.execute(delete, binds)