Properly escape post content that contains HTML tags; fix issue with scraping when metadata is in an unanticipated form
This commit is contained in:
parent
192d4e739e
commit
b7d80002a1
|
|
@ -278,6 +278,8 @@ class YARS:
|
|||
id = item["media_id"]
|
||||
if id in post_data["media_metadata"]:
|
||||
metadata = post_data["media_metadata"][id]
|
||||
if "p" not in metadata:
|
||||
continue
|
||||
m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"])
|
||||
if m:
|
||||
media_urls.append(f"https://i.redd.it/{m.group(1)}")
|
||||
|
|
@ -286,6 +288,8 @@ class YARS:
|
|||
media_urls = []
|
||||
for id in post_data["media_metadata"]:
|
||||
metadata = post_data["media_metadata"][id]
|
||||
if "p" not in metadata:
|
||||
continue
|
||||
m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"])
|
||||
if m:
|
||||
media_urls.append(f"https://i.redd.it/{m.group(1)}")
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ from urllib.parse import urlparse
|
|||
import delete_posts
|
||||
import config
|
||||
import json
|
||||
import html
|
||||
import re
|
||||
import sqlite3
|
||||
import subprocess
|
||||
|
|
@ -415,6 +416,7 @@ def add_age_to_posts(posts):
|
|||
def reformat_body(posts):
|
||||
for post in posts:
|
||||
if "body" in post and post["body"] is not None:
|
||||
post["body"] = html.escape(post["body"])
|
||||
post["body"] = post["body"].rstrip().replace("\n", "<br>")
|
||||
post["body"] = re.sub(r"\[(.*?)\]\((.*?)\)", r'<b><a href="\2" style="white-space: nowrap;" class="no-style-link">\1</a></b>', post["body"])
|
||||
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ def run():
|
|||
print("Deleting old media db rows")
|
||||
delete = "DELETE FROM media WHERE permalink NOT IN (SELECT permalink FROM post)"
|
||||
cursor.execute(delete)
|
||||
print("Deleving media db for read posts")
|
||||
print("Deleting media db for read posts")
|
||||
delete = "DELETE FROM media WHERE permalink IN (SELECT permalink FROM post WHERE hidden = ?)"
|
||||
binds = [True]
|
||||
cursor.execute(delete, binds)
|
||||
|
|
|
|||
Loading…
Reference in New Issue