Properly escape post content that contains HTML tags; fix issue with scraping when metadata is in an unanticipated form

This commit is contained in:
John Stephani 2026-01-30 19:42:40 -06:00
parent 192d4e739e
commit b7d80002a1
3 changed files with 280 additions and 274 deletions

View File

@@ -278,6 +278,8 @@ class YARS:
id = item["media_id"]
if id in post_data["media_metadata"]:
metadata = post_data["media_metadata"][id]
if "p" not in metadata:
continue
m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"])
if m:
media_urls.append(f"https://i.redd.it/{m.group(1)}")
@@ -286,6 +288,8 @@ class YARS:
media_urls = []
for id in post_data["media_metadata"]:
metadata = post_data["media_metadata"][id]
if "p" not in metadata:
continue
m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"])
if m:
media_urls.append(f"https://i.redd.it/{m.group(1)}")

View File

@@ -4,6 +4,7 @@ from urllib.parse import urlparse
import delete_posts
import config
import json
import html
import re
import sqlite3
import subprocess
@@ -415,6 +416,7 @@ def add_age_to_posts(posts):
def reformat_body(posts):
for post in posts:
if "body" in post and post["body"] is not None:
post["body"] = html.escape(post["body"])
post["body"] = post["body"].rstrip().replace("\n", "<br>")
post["body"] = re.sub(r"\[(.*?)\]\((.*?)\)", r'<b><a href="\2" style="white-space: nowrap;" class="no-style-link">\1</a></b>', post["body"])

View File

@@ -36,7 +36,7 @@ def run():
print("Deleting old media db rows")
delete = "DELETE FROM media WHERE permalink NOT IN (SELECT permalink FROM post)"
cursor.execute(delete)
print("Deleving media db for read posts")
print("Deleting media db for read posts")
delete = "DELETE FROM media WHERE permalink IN (SELECT permalink FROM post WHERE hidden = ?)"
binds = [True]
cursor.execute(delete, binds)