Properly escape post content that contains HTML tags; fix issue with scraping when media metadata is in an unanticipated form (an entry with no "p" key)
This commit is contained in:
parent
192d4e739e
commit
b7d80002a1
|
|
@ -278,6 +278,8 @@ class YARS:
|
||||||
id = item["media_id"]
|
id = item["media_id"]
|
||||||
if id in post_data["media_metadata"]:
|
if id in post_data["media_metadata"]:
|
||||||
metadata = post_data["media_metadata"][id]
|
metadata = post_data["media_metadata"][id]
|
||||||
|
if "p" not in metadata:
|
||||||
|
continue
|
||||||
m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"])
|
m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"])
|
||||||
if m:
|
if m:
|
||||||
media_urls.append(f"https://i.redd.it/{m.group(1)}")
|
media_urls.append(f"https://i.redd.it/{m.group(1)}")
|
||||||
|
|
@ -286,6 +288,8 @@ class YARS:
|
||||||
media_urls = []
|
media_urls = []
|
||||||
for id in post_data["media_metadata"]:
|
for id in post_data["media_metadata"]:
|
||||||
metadata = post_data["media_metadata"][id]
|
metadata = post_data["media_metadata"][id]
|
||||||
|
if "p" not in metadata:
|
||||||
|
continue
|
||||||
m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"])
|
m = re.search(r"redd\.it\/(.+)\?", metadata["p"][0]["u"])
|
||||||
if m:
|
if m:
|
||||||
media_urls.append(f"https://i.redd.it/{m.group(1)}")
|
media_urls.append(f"https://i.redd.it/{m.group(1)}")
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ from urllib.parse import urlparse
|
||||||
import delete_posts
|
import delete_posts
|
||||||
import config
|
import config
|
||||||
import json
|
import json
|
||||||
|
import html
|
||||||
import re
|
import re
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
@ -415,6 +416,7 @@ def add_age_to_posts(posts):
|
||||||
def reformat_body(posts):
|
def reformat_body(posts):
|
||||||
for post in posts:
|
for post in posts:
|
||||||
if "body" in post and post["body"] is not None:
|
if "body" in post and post["body"] is not None:
|
||||||
|
post["body"] = html.escape(post["body"])
|
||||||
post["body"] = post["body"].rstrip().replace("\n", "<br>")
|
post["body"] = post["body"].rstrip().replace("\n", "<br>")
|
||||||
post["body"] = re.sub(r"\[(.*?)\]\((.*?)\)", r'<b><a href="\2" style="white-space: nowrap;" class="no-style-link">\1</a></b>', post["body"])
|
post["body"] = re.sub(r"\[(.*?)\]\((.*?)\)", r'<b><a href="\2" style="white-space: nowrap;" class="no-style-link">\1</a></b>', post["body"])
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -36,7 +36,7 @@ def run():
|
||||||
print("Deleting old media db rows")
|
print("Deleting old media db rows")
|
||||||
delete = "DELETE FROM media WHERE permalink NOT IN (SELECT permalink FROM post)"
|
delete = "DELETE FROM media WHERE permalink NOT IN (SELECT permalink FROM post)"
|
||||||
cursor.execute(delete)
|
cursor.execute(delete)
|
||||||
print("Deleving media db for read posts")
|
print("Deleting media db for read posts")
|
||||||
delete = "DELETE FROM media WHERE permalink IN (SELECT permalink FROM post WHERE hidden = ?)"
|
delete = "DELETE FROM media WHERE permalink IN (SELECT permalink FROM post WHERE hidden = ?)"
|
||||||
binds = [True]
|
binds = [True]
|
||||||
cursor.execute(delete, binds)
|
cursor.execute(delete, binds)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue