Add ability to block subreddits and users, add subreddit link and username to every post

This commit is contained in:
John Stephani 2025-12-27 20:02:27 -06:00
parent 9fc267ec6a
commit 7ecabdf2b7
8 changed files with 204 additions and 100 deletions

View File

@ -260,6 +260,7 @@ class YARS:
post_data = post["data"]
post_info = {
"title": post_data["title"],
"subreddit": post_data["subreddit"],
"author": post_data["author"],
"permalink": post_data["permalink"],
"score": post_data["score"],

View File

@ -58,39 +58,64 @@ def admin():
connection = sqlite3.connect(config.db_file)
cursor = connection.cursor()
if request.method == 'DELETE':
delete = """
DELETE FROM
subreddit
WHERE
subreddit = ?
"""
type = request.args.get("type")
if type == "sub":
delete = """
DELETE FROM
subreddit
WHERE
subreddit = ?
"""
elif type == "block":
delete = """
DELETE FROM
block
WHERE
name = ?
"""
else:
connection.close()
return ""
binds = [request.args.get("name")]
cursor.execute(delete, binds)
connection.commit()
connection.close()
return ""
elif request.method == 'POST':
upsert = """
INSERT INTO
subreddit (subreddit, minimum_score, fetch_by, fetch_max)
VALUES
(?, ?, ?, ?)
ON CONFLICT
(subreddit)
DO UPDATE SET
minimum_score=excluded.minimum_score,
fetch_by=excluded.fetch_by,
fetch_max=excluded.fetch_max
"""
binds = [
request.form.get("name"),
int(request.form.get("score")),
request.form.get("by"),
int(request.form.get("max"))
]
type = request.form.get("type")
if type == "sub":
upsert = """
INSERT INTO
subreddit (subreddit, minimum_score, fetch_by, fetch_max)
VALUES
(?, ?, ?, ?)
ON CONFLICT
(subreddit)
DO UPDATE SET
minimum_score=excluded.minimum_score,
fetch_by=excluded.fetch_by,
fetch_max=excluded.fetch_max
"""
binds = [
request.form.get("name"),
int(request.form.get("score")),
request.form.get("by"),
int(request.form.get("max"))
]
elif type == "block":
upsert = """
INSERT OR IGNORE INTO
block (name)
VALUES
(?)
"""
binds = [request.form.get("name")]
else:
connection.close()
return ""
cursor.execute(upsert, binds)
connection.commit()
post_subreddits = get_subreddits(cursor)
sidebar_links = get_sidebar_links(cursor)
select = """
SELECT
subreddit,
@ -100,16 +125,24 @@ def admin():
FROM
subreddit
"""
sub_subreddits = cursor.execute(select).fetchall()
subreddits = cursor.execute(select).fetchall()
select = """
SELECT
name
FROM
block
"""
rows = cursor.execute(select).fetchall()
blocks = [row[0] for row in rows]
connection.close()
return render_template('admin.html', post_subreddits=post_subreddits, sub_subreddits=sub_subreddits)
return render_template('admin.html', sidebar_links=sidebar_links, subreddits=subreddits, blocks=blocks)
@app.route('/r/all')
def front_page():
title = "/r/all"
connection = sqlite3.connect(config.db_file)
cursor = connection.cursor()
subreddits = get_subreddits(cursor)
sidebar_links = get_sidebar_links(cursor)
select = """
SELECT
post
@ -123,16 +156,15 @@ def front_page():
"""
binds = [False, config.posts_per_page_load]
posts = get_posts_from_select(cursor, select, binds)
add_subreddits_to_posts(posts)
connection.close()
return render_template('index.html', title=title, posts=posts, subreddits=subreddits)
return render_template('index.html', title=title, posts=posts, sidebar_links=sidebar_links)
@app.route('/r/other')
def other_page():
title = "/r/other"
connection = sqlite3.connect(config.db_file)
cursor = connection.cursor()
subreddits = get_subreddits(cursor)
sidebar_links = get_sidebar_links(cursor)
select = """
SELECT
post
@ -165,16 +197,15 @@ def other_page():
"""
binds = [False, False, config.other_posts_cutoff, config.posts_per_page_load]
posts = get_posts_from_select(cursor, select, binds)
add_subreddits_to_posts(posts)
connection.close()
return render_template('index.html', title=title, posts=posts, subreddits=subreddits)
return render_template('index.html', title=title, posts=posts, sidebar_links=sidebar_links)
@app.route('/r/<path:subreddit>')
def get_subreddit(subreddit):
title = f"/r/{subreddit}"
connection = sqlite3.connect(config.db_file)
cursor = connection.cursor()
subreddits = get_subreddits(cursor)
sidebar_links = get_sidebar_links(cursor)
select = """
SELECT
post
@ -190,9 +221,9 @@ def get_subreddit(subreddit):
binds = [subreddit, False, config.posts_per_page_load]
posts = get_posts_from_select(cursor, select, binds)
connection.close()
return render_template('index.html', title=title, posts=posts, subreddits=subreddits)
return render_template('index.html', title=title, posts=posts, sidebar_links=sidebar_links)
def get_subreddits(cursor):
def get_sidebar_links(cursor):
select = """
SELECT
subreddit
@ -215,16 +246,17 @@ def get_subreddits(cursor):
"""
binds = [False, config.other_posts_cutoff]
results = cursor.execute(select, binds).fetchall()
subreddits = [f"/r/{sub[0]}" for sub in results]
subreddits.insert(0, "/r/all")
subreddits.append("/r/other")
subreddits.append("/admin")
return subreddits
links = [f"/r/{sub[0]}" for sub in results]
links.insert(0, "/r/all")
links.append("/r/other")
links.append("/admin")
return links
def get_posts_from_select(cursor, select, binds):
    """Run a post query and return the decoded posts, ready for rendering.

    The first column of every row returned by ``select`` is parsed as JSON.
    The resulting list is then passed through ``add_media_html_to_posts``
    and ``add_subreddits_to_posts`` before being returned.

    Args:
        cursor: database cursor used to execute the query.
        select: SQL text whose first result column holds the post JSON.
        binds: parameter values bound to the placeholders in ``select``.

    Returns:
        The list of decoded post objects.
    """
    rows = cursor.execute(select, binds).fetchall()
    decoded = []
    for row in rows:
        decoded.append(json.loads(row[0]))
    add_media_html_to_posts(decoded)
    add_subreddits_to_posts(decoded)
    return decoded
def add_media_html_to_posts(posts):
@ -239,9 +271,11 @@ def add_media_html_to_posts(posts):
post["media_html"] = media_html
def add_subreddits_to_posts(posts):
    """Backfill the ``subreddit`` key on posts that were stored without it.

    The subreddit name is taken from the ``/r/<name>/`` segment of the
    post's ``permalink`` and stored without the ``/r/`` prefix. Posts that
    already carry a ``subreddit`` key are left untouched, and so are posts
    whose permalink has no such segment.

    Args:
        posts: list of post dicts, each with a ``permalink`` key; modified
            in place.
    """
    # todo, remove after 30 days once subreddit is naturally a part of the post data
    for post in posts:
        if "subreddit" in post:
            continue
        match = re.search(r"\/r\/([a-zA-Z0-9_]+)\/.*", post["permalink"])
        # A permalink without an /r/<name>/ segment yields no match; skip it
        # instead of failing on match.group.
        if match is not None:
            post["subreddit"] = match.group(1)
def get_media_html(file, priority=False):

View File

@ -1,10 +1,10 @@
# Scheduler configuration
max_age_days = 30
max_age_seconds = max_age_days * 24 * 60 * 60
other_posts_cutoff = 1 #subreddits with this many unread posts or fewer are merged to /r/other
other_posts_cutoff = 4 #subreddits with this many unread posts or fewer are merged to /r/other
# Webpage configuration
posts_per_page_load = 50
posts_per_page_load = 25
db_dir = "/reddit/db"
media_dir = "/reddit/media"

View File

@ -4,42 +4,50 @@ import time
import sqlite3
import subprocess
if __name__ == "__main__":
connection = sqlite3.connect(config.db_file)
cursor = connection.cursor()
now = int(time.time())
max_created_utc = now - config.max_age_seconds
print("Deleting old posts")
delete = "DELETE FROM post WHERE created_utc < ?"
binds = [max_created_utc]
cursor.execute(delete, binds)
print("Deleting old media db rows")
delete = "DELETE FROM media WHERE permalink NOT IN (SELECT permalink FROM post)"
cursor.execute(delete)
print("Deleving media db for read posts")
delete = "DELETE FROM media WHERE permalink IN (SELECT permalink FROM post WHERE hidden = ?)"
binds = [True]
cursor.execute(delete, binds)
print("Updating media_fetched for read posts")
update = "UPDATE post SET media_fetched = ? WHERE hidden = ?"
binds = [False, True]
cursor.execute(update, binds)
all_files_local = subprocess.run(["find", "/reddit/media", "-type", "f"], capture_output=True, text=True)
all_files_local = set(all_files_local.stdout.splitlines())
select = "SELECT local from media"
results = cursor.execute(select).fetchall()
connection.commit()
connection.close()
all_files_db = set([row[0] for row in results])
extra_files = all_files_local - all_files_db
print("Deleting old files")
for file in extra_files:
print(f"Removing {file}")
os.remove(file)
empty_dirs = subprocess.run(["find", "/reddit/media", "-type", "d", "-empty"], capture_output=True, text=True)
empty_dirs = set(empty_dirs.stdout.splitlines())
print("Deleting empty directories")
for dir in empty_dirs:
print(f"Removind dir {dir}")
os.rmdir(dir)
print("Done")
def run():
    """Prune expired, blocked and already-read data from the db and disk.

    Steps, in order:
      1. Delete posts whose ``created_utc`` is older than
         ``config.max_age_seconds``.
      2. Delete posts whose subreddit has a ``/r/<name>`` row in ``block``.
      3. Delete media rows whose post no longer exists or is hidden, and
         reset ``media_fetched`` on hidden posts.
      4. Remove files under ``config.media_dir`` that no media row points
         to, then remove the directories that ``find`` reports as empty.

    The database connection is closed even if one of the statements fails.
    """
    connection = sqlite3.connect(config.db_file)
    try:
        cursor = connection.cursor()
        max_created_utc = int(time.time()) - config.max_age_seconds

        print("Deleting old posts")
        cursor.execute("DELETE FROM post WHERE created_utc < ?", [max_created_utc])

        print("Deleting posts from blocked subreddits")
        # substr() is 1-based, so starting at 4 drops the "/r/" prefix that
        # block entries carry but post.subreddit does not. Doing this as a
        # subquery avoids building the statement from a placeholder list.
        delete = """
            DELETE FROM post
            WHERE subreddit IN (
                SELECT substr(name, 4) FROM block WHERE name LIKE '/r/%'
            )
        """
        cursor.execute(delete)

        print("Deleting old media db rows")
        cursor.execute("DELETE FROM media WHERE permalink NOT IN (SELECT permalink FROM post)")

        print("Deleving media db for read posts")
        delete = "DELETE FROM media WHERE permalink IN (SELECT permalink FROM post WHERE hidden = ?)"
        cursor.execute(delete, [True])

        print("Updating media_fetched for read posts")
        cursor.execute("UPDATE post SET media_fetched = ? WHERE hidden = ?", [False, True])

        # List the files on disk before reading the media table, as the
        # original did, so the two snapshots are taken in the same order.
        found_files = subprocess.run(
            ["find", config.media_dir, "-type", "f"],
            capture_output=True,
            text=True,
        )
        all_files_local = set(found_files.stdout.splitlines())
        results = cursor.execute("SELECT local from media").fetchall()
        connection.commit()
    finally:
        connection.close()

    all_files_db = {row[0] for row in results}
    extra_files = all_files_local - all_files_db
    print("Deleting old files")
    for path in extra_files:
        print(f"Removing {path}")
        os.remove(path)

    found_dirs = subprocess.run(
        ["find", config.media_dir, "-type", "d", "-empty"],
        capture_output=True,
        text=True,
    )
    empty_dirs = set(found_dirs.stdout.splitlines())
    print("Deleting empty directories")
    for directory in empty_dirs:
        print(f"Removind dir {directory}")
        os.rmdir(directory)
    print("Done")
run()

View File

@ -9,5 +9,6 @@ cursor = connection.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS post(permalink primary key, subreddit, created_utc, score, media_fetched, post, hidden)")
cursor.execute("CREATE TABLE IF NOT EXISTS media(permalink, url , local, PRIMARY KEY (permalink, url))")
cursor.execute("CREATE TABLE IF NOT EXISTS subreddit(subreddit primary key, minimum_score, fetch_by, fetch_max)")
cursor.execute("CREATE TABLE IF NOT EXISTS block(name primary key)")
connection.commit()
connection.close()

View File

@ -18,7 +18,7 @@ from yars.utils import download_image
miner = YARS()
# Function to scrape subreddit post details and save to JSON
def scrape_subreddit_data(subreddit, minimum_score=100, pull_by="day", limit=5):
def scrape_subreddit_data(subreddit, minimum_score=100, pull_by="day", limit=5, blocked_subs=[], blocked_users=[]):
ret = []
print(f"Starting {subreddit} with min score {minimum_score}, by {pull_by}, limit {limit}")
empty = dict()
@ -32,6 +32,7 @@ def scrape_subreddit_data(subreddit, minimum_score=100, pull_by="day", limit=5):
break
post_data = {
"permalink": post.get("permalink"),
"subreddit": post.get("subreddit"),
"title": post.get("title", ""),
"author": post.get("author", ""),
"created_utc": post.get("created_utc", ""),
@ -40,6 +41,12 @@ def scrape_subreddit_data(subreddit, minimum_score=100, pull_by="day", limit=5):
"media_urls" : post.get("media_urls", []),
"body": post.get("body", None),
}
if post_data["subreddit"] in blocked_subs:
print(f'Ignoring post from {post_data["subreddit"]}')
continue
if post_data["author"] in blocked_users:
print(f'Ignoring post from {post_data["author"]}')
continue
ret.append(post_data)
print(f"Finished {subreddit}")
return ret
@ -56,8 +63,7 @@ def save_posts_to_db(data, cursor):
binds = []
for post in data:
binds.append(post["permalink"])
m = re.search(r"\/r\/([a-zA-Z0-9_]+)\/.*", post["permalink"])
binds.append(m.group(1)) #subreddit
binds.append(post["subreddit"])
binds.append(post["created_utc"])
binds.append(post["score"])
binds.append(False)
@ -103,6 +109,23 @@ def download_comments_for_permalink(permalink, cursor):
binds = [permalink, post_details["comments"]]
cursor.execute(upsert, binds)
def get_blocks(cursor):
    """Load the block list and split it into subreddits and users.

    Rows of the ``block`` table hold names with their prefix: ``/r/<name>``
    for a subreddit and ``/u/<name>`` for a user. Entries that start with
    neither prefix are ignored.

    Args:
        cursor: sqlite3 cursor on a database that has the ``block`` table.

    Returns:
        A ``(subs, users)`` tuple of lists holding the names with the
        three-character prefix removed.
    """
    select = """
        SELECT
            name
        FROM
            block
    """
    subs = []
    users = []
    for (name,) in cursor.execute(select):
        if not isinstance(name, str):
            continue
        # Test the prefix, not mere containment: the slice below assumes the
        # marker occupies the first three characters.
        if name.startswith("/r/"):
            subs.append(name[3:])
        elif name.startswith("/u/"):
            users.append(name[3:])
    return subs, users
# Main execution
if __name__ == "__main__":
os.makedirs(config.media_dir, exist_ok=True)
@ -118,8 +141,9 @@ if __name__ == "__main__":
subreddit
"""
subreddits = cursor.execute(select).fetchall()
blocked_subs, blocked_users = get_blocks(cursor)
for subreddit in subreddits:
post_data = scrape_subreddit_data(subreddit[0], subreddit[1], subreddit[2], subreddit[3])
post_data = scrape_subreddit_data(subreddit[0], subreddit[1], subreddit[2], subreddit[3], blocked_subs, blocked_users)
save_posts_to_db(post_data, cursor)
connection.commit()
download_media(cursor)

View File

@ -170,14 +170,14 @@
<body>
<div class="container">
<div class="sidebar">
{% for subreddit in post_subreddits %}
<a href="{{ subreddit }}">{{ subreddit }}</a>
{% for link in sidebar_links %}
<a href="{{ link }}">{{ link }}</a>
{% endfor %}
</div>
<div class="content">
<h1>Admin Panel</h1>
<div class="post">
<h3>Subreddits</h3>
<h2>Subreddits</h2>
<table>
<tr>
<th>Subreddit</th>
@ -186,19 +186,20 @@
<th>Fetch max</th>
<th>Update</th>
</tr>
{% for subreddit in sub_subreddits %}
{% for subreddit in subreddits %}
<tr>
<td>/r/{{ subreddit[0] }}</td>
<td>{{ subreddit[1] }}</td>
<td>{{ subreddit[2] }}</td>
<td>{{ subreddit[3] }}</td>
<td><button onclick='deleteSubreddit("{{ subreddit[0] }}")'>Delete</button></td>
<td><button onclick='deleteEntry("sub","{{ subreddit[0] }}")'>Delete</button></td>
</tr>
{% endfor %}
<tr></tr>
<tr></tr>
<tr>
<form method="post">
<input name="type" type="text" value="sub" hidden>
<td>/r/<input name="name" type="text"></td>
<td><input name="score" type="text" value="100"></td>
<td>
@ -213,11 +214,34 @@
</tr>
</table>
</div>
<div class="post">
<h2>Blocked</h2>
<table>
<tr>
<th>Name</th>
</tr>
{% for block in blocks %}
<tr>
<td>{{ block }}</td>
<td><button onclick='deleteEntry("block", "{{ block }}")'>Delete</button></td>
</tr>
{% endfor %}
<tr></tr>
<tr></tr>
<tr>
<form method="post">
<input name="type" type="text" value="block" hidden>
<td><input name="name" type="text"></td>
<td><button type="submit">Add</button></td>
</form>
</tr>
</table>
</div>
</div>
</div>
<script>
function deleteSubreddit(name) {
fetch('/admin?name='+name, {
function deleteEntry(type, name) {
fetch('/admin?name='+ name + "&type=" + type, {
method: 'DELETE'
}).then(() => {
window.location.href = window.location.href;

View File

@ -107,6 +107,11 @@
background-color: var(--darker);
color: var(--light);
}
a.no-style-link {
color: inherit;
text-decoration: inherit;
cursor: pointer;
}
.invert {
filter: invert(1);
transition: filter 0.3s;
@ -170,8 +175,8 @@
<body>
<div class="container">
<div class="sidebar">
{% for subreddit in subreddits %}
<a href="{{ subreddit }}">{{ subreddit }}</a>
{% for link in sidebar_links %}
<a href="{{ link }}">{{ link }}</a>
{% endfor %}
</div>
<div class="content">
@ -179,9 +184,16 @@
{% for post in posts %}
<div class="post">
<h3>{{ post.title }}</h3>
{% if post.subreddit %}
<h5>{{ post.subreddit }}</h5>
{% endif %}
<span>
{% if post.subreddit %}
<h5>
<a href="/r/{{ post.subreddit }}" class="no-style-link">/r/{{ post.subreddit }}</a>
{% if post.author %}
— {{ post.author }}
{% endif %}
</h5>
{% endif %}
</span>
{% if post.media_html|length > 0 %}
<div class="media-div">
{% for media in post.media_html %}