diff --git a/app/YARS/src/yars/yars.py b/app/YARS/src/yars/yars.py
index eedb97f..aff744e 100644
--- a/app/YARS/src/yars/yars.py
+++ b/app/YARS/src/yars/yars.py
@@ -260,6 +260,7 @@ class YARS:
             post_data = post["data"]
             post_info = {
                 "title": post_data["title"],
+                "subreddit": post_data["subreddit"],
                 "author": post_data["author"],
                 "permalink": post_data["permalink"],
                 "score": post_data["score"],
diff --git a/app/app.py b/app/app.py
index e447b01..cc704fa 100755
--- a/app/app.py
+++ b/app/app.py
@@ -58,39 +58,64 @@ def admin():
     connection = sqlite3.connect(config.db_file)
     cursor = connection.cursor()
     if request.method == 'DELETE':
-        delete = """
-            DELETE FROM
-                subreddit
-            WHERE
-                subreddit = ?
-        """
+        type = request.args.get("type")
+        if type == "sub":
+            delete = """
+                DELETE FROM
+                    subreddit
+                WHERE
+                    subreddit = ?
+            """
+        elif type == "block":
+            delete = """
+                DELETE FROM
+                    block
+                WHERE
+                    name = ?
+            """
+        else:
+            connection.close()
+            return ""
         binds = [request.args.get("name")]
         cursor.execute(delete, binds)
         connection.commit()
         connection.close()
         return ""
     elif request.method == 'POST':
-        upsert = """
-            INSERT INTO
-                subreddit (subreddit, minimum_score, fetch_by, fetch_max)
-            VALUES
-                (?, ?, ?, ?)
-            ON CONFLICT
-                (subreddit)
-            DO UPDATE SET
-                minimum_score=excluded.minimum_score,
-                fetch_by=excluded.fetch_by,
-                fetch_max=excluded.fetch_max
-        """
-        binds = [
-            request.form.get("name"),
-            int(request.form.get("score")),
-            request.form.get("by"),
-            int(request.form.get("max"))
-        ]
+        type = request.form.get("type")
+        if type == "sub":
+            upsert = """
+                INSERT INTO
+                    subreddit (subreddit, minimum_score, fetch_by, fetch_max)
+                VALUES
+                    (?, ?, ?, ?)
+                ON CONFLICT
+                    (subreddit)
+                DO UPDATE SET
+                    minimum_score=excluded.minimum_score,
+                    fetch_by=excluded.fetch_by,
+                    fetch_max=excluded.fetch_max
+            """
+            binds = [
+                request.form.get("name"),
+                int(request.form.get("score")),
+                request.form.get("by"),
+                int(request.form.get("max"))
+            ]
+        elif type == "block":
+            upsert = """
+                INSERT OR IGNORE INTO
+                    block (name)
+                VALUES
+                    (?)
+            """
+            binds = [request.form.get("name")]
+        else:
+            connection.close()
+            return ""
         cursor.execute(upsert, binds)
         connection.commit()
-    post_subreddits = get_subreddits(cursor)
+    sidebar_links = get_sidebar_links(cursor)
     select = """
         SELECT
             subreddit,
@@ -100,16 +125,24 @@ def admin():
         FROM
             subreddit
     """
-    sub_subreddits = cursor.execute(select).fetchall()
+    subreddits = cursor.execute(select).fetchall()
+    select = """
+        SELECT
+            name
+        FROM
+            block
+    """
+    rows = cursor.execute(select).fetchall()
+    blocks = [row[0] for row in rows]
     connection.close()
-    return render_template('admin.html', post_subreddits=post_subreddits, sub_subreddits=sub_subreddits)
+    return render_template('admin.html', sidebar_links=sidebar_links, subreddits=subreddits, blocks=blocks)
 
 @app.route('/r/all')
 def front_page():
     title = "/r/all"
     connection = sqlite3.connect(config.db_file)
     cursor = connection.cursor()
-    subreddits = get_subreddits(cursor)
+    sidebar_links = get_sidebar_links(cursor)
     select = """
         SELECT
             post
@@ -123,16 +156,15 @@ def front_page():
     """
     binds = [False, config.posts_per_page_load]
     posts = get_posts_from_select(cursor, select, binds)
-    add_subreddits_to_posts(posts)
     connection.close()
-    return render_template('index.html', title=title, posts=posts, subreddits=subreddits)
+    return render_template('index.html', title=title, posts=posts, sidebar_links=sidebar_links)
 
 @app.route('/r/other')
 def other_page():
     title = "/r/other"
     connection = sqlite3.connect(config.db_file)
     cursor = connection.cursor()
-    subreddits = get_subreddits(cursor)
+    sidebar_links = get_sidebar_links(cursor)
     select = """
         SELECT
             post
@@ -165,16 +197,15 @@ def other_page():
     """
     binds = [False, False, config.other_posts_cutoff, config.posts_per_page_load]
     posts = get_posts_from_select(cursor, select, binds)
-    add_subreddits_to_posts(posts)
     connection.close()
-    return render_template('index.html', title=title, posts=posts, subreddits=subreddits)
+    return render_template('index.html', title=title, posts=posts, sidebar_links=sidebar_links)
 
 @app.route('/r/<subreddit>')
 def get_subreddit(subreddit):
     title = f"/r/{subreddit}"
     connection = sqlite3.connect(config.db_file)
     cursor = connection.cursor()
-    subreddits = get_subreddits(cursor)
+    sidebar_links = get_sidebar_links(cursor)
     select = """
         SELECT
             post
@@ -190,9 +221,9 @@ def get_subreddit(subreddit):
     binds = [subreddit, False, config.posts_per_page_load]
     posts = get_posts_from_select(cursor, select, binds)
     connection.close()
-    return render_template('index.html', title=title, posts=posts, subreddits=subreddits)
+    return render_template('index.html', title=title, posts=posts, sidebar_links=sidebar_links)
 
-def get_subreddits(cursor):
+def get_sidebar_links(cursor):
     select = """
         SELECT
             subreddit
@@ -215,16 +246,17 @@ def get_subreddits(cursor):
     """
     binds = [False, config.other_posts_cutoff]
     results = cursor.execute(select, binds).fetchall()
-    subreddits = [f"/r/{sub[0]}" for sub in results]
-    subreddits.insert(0, "/r/all")
-    subreddits.append("/r/other")
-    subreddits.append("/admin")
-    return subreddits
+    links = [f"/r/{sub[0]}" for sub in results]
+    links.insert(0, "/r/all")
+    links.append("/r/other")
+    links.append("/admin")
+    return links
 
 def get_posts_from_select(cursor, select, binds):
     results = cursor.execute(select, binds).fetchall()
     posts = [json.loads(post[0]) for post in results]
     add_media_html_to_posts(posts)
+    add_subreddits_to_posts(posts)
     return posts
 
 def add_media_html_to_posts(posts):
@@ -239,9 +271,11 @@ def add_media_html_to_posts(posts):
         post["media_html"] = media_html
 
 def add_subreddits_to_posts(posts):
+    # todo, remove after 30 days once subreddit is naturally a part of the post data
     for post in posts:
-        m = re.search(r"\/r\/([a-zA-Z0-9_]+)\/.*", post["permalink"])
-        post["subreddit"] = f"/r/{m.group(1)}"
+        if "subreddit" not in post:
+            m = re.search(r"\/r\/([a-zA-Z0-9_]+)\/.*", post["permalink"])
+            post["subreddit"] = m.group(1)
 
 
 def get_media_html(file, priority=False):
diff --git a/app/config.py b/app/config.py
index f1881b6..5572aac 100644
--- a/app/config.py
+++ b/app/config.py
@@ -1,10 +1,10 @@
 # Scheduler configuration
 max_age_days = 30
 max_age_seconds = max_age_days * 24 * 60 * 60
-other_posts_cutoff = 1 #subreddits with this many unread posts or fewer are merged to /r/other
+other_posts_cutoff = 4 #subreddits with this many unread posts or fewer are merged to /r/other
 
 # Webpage configuration
-posts_per_page_load = 50
+posts_per_page_load = 25
 db_dir = "/reddit/db"
 media_dir = "/reddit/media"
diff --git a/app/delete_posts.py b/app/delete_posts.py
index 957c229..a342f39 100644
--- a/app/delete_posts.py
+++ b/app/delete_posts.py
@@ -4,42 +4,50 @@ import time
 import sqlite3
 import subprocess
 
-if __name__ == "__main__":
-    connection = sqlite3.connect(config.db_file)
-    cursor = connection.cursor()
-    now = int(time.time())
-    max_created_utc = now - config.max_age_seconds
-    print("Deleting old posts")
-    delete = "DELETE FROM post WHERE created_utc < ?"
-    binds = [max_created_utc]
-    cursor.execute(delete, binds)
-    print("Deleting old media db rows")
-    delete = "DELETE FROM media WHERE permalink NOT IN (SELECT permalink FROM post)"
-    cursor.execute(delete)
-    print("Deleving media db for read posts")
-    delete = "DELETE FROM media WHERE permalink IN (SELECT permalink FROM post WHERE hidden = ?)"
-    binds = [True]
-    cursor.execute(delete, binds)
-    print("Updating media_fetched for read posts")
-    update = "UPDATE post SET media_fetched = ? WHERE hidden = ?"
-    binds = [False, True]
-    cursor.execute(update, binds)
-    all_files_local = subprocess.run(["find", "/reddit/media", "-type", "f"], capture_output=True, text=True)
-    all_files_local = set(all_files_local.stdout.splitlines())
-    select = "SELECT local from media"
-    results = cursor.execute(select).fetchall()
-    connection.commit()
-    connection.close()
-    all_files_db = set([row[0] for row in results])
-    extra_files = all_files_local - all_files_db
-    print("Deleting old files")
-    for file in extra_files:
-        print(f"Removing {file}")
-        os.remove(file)
-    empty_dirs = subprocess.run(["find", "/reddit/media", "-type", "d", "-empty"], capture_output=True, text=True)
-    empty_dirs = set(empty_dirs.stdout.splitlines())
-    print("Deleting empty directories")
-    for dir in empty_dirs:
-        print(f"Removind dir {dir}")
-        os.rmdir(dir)
-    print("Done")
+def run():
+    connection = sqlite3.connect(config.db_file)
+    cursor = connection.cursor()
+    now = int(time.time())
+    max_created_utc = now - config.max_age_seconds
+    print("Deleting old posts")
+    delete = "DELETE FROM post WHERE created_utc < ?"
+    binds = [max_created_utc]
+    cursor.execute(delete, binds)
+    print("Deleting posts from blocked subreddits")
+    select = "SELECT name FROM block WHERE name like '/r/%'"
+    binds = [row[0][3:] for row in cursor.execute(select).fetchall()]
+    bind_array = ",".join(["?"]*len(binds))
+    delete = f"DELETE FROM post WHERE subreddit IN ({bind_array})"
+    cursor.execute(delete, binds)
+    print("Deleting old media db rows")
+    delete = "DELETE FROM media WHERE permalink NOT IN (SELECT permalink FROM post)"
+    cursor.execute(delete)
+    print("Deleving media db for read posts")
+    delete = "DELETE FROM media WHERE permalink IN (SELECT permalink FROM post WHERE hidden = ?)"
+    binds = [True]
+    cursor.execute(delete, binds)
+    print("Updating media_fetched for read posts")
+    update = "UPDATE post SET media_fetched = ? WHERE hidden = ?"
+    binds = [False, True]
+    cursor.execute(update, binds)
+    all_files_local = subprocess.run(["find", "/reddit/media", "-type", "f"], capture_output=True, text=True)
+    all_files_local = set(all_files_local.stdout.splitlines())
+    select = "SELECT local from media"
+    results = cursor.execute(select).fetchall()
+    connection.commit()
+    connection.close()
+    all_files_db = set([row[0] for row in results])
+    extra_files = all_files_local - all_files_db
+    print("Deleting old files")
+    for file in extra_files:
+        print(f"Removing {file}")
+        os.remove(file)
+    empty_dirs = subprocess.run(["find", "/reddit/media", "-type", "d", "-empty"], capture_output=True, text=True)
+    empty_dirs = set(empty_dirs.stdout.splitlines())
+    print("Deleting empty directories")
+    for dir in empty_dirs:
+        print(f"Removind dir {dir}")
+        os.rmdir(dir)
+    print("Done")
+
+run()
\ No newline at end of file
diff --git a/app/make_db.py b/app/make_db.py
index 6edc0f6..4782ccd 100755
--- a/app/make_db.py
+++ b/app/make_db.py
@@ -9,5 +9,6 @@ cursor = connection.cursor()
 cursor.execute("CREATE TABLE IF NOT EXISTS post(permalink primary key, subreddit, created_utc, score, media_fetched, post, hidden)")
 cursor.execute("CREATE TABLE IF NOT EXISTS media(permalink, url , local, PRIMARY KEY (permalink, url))")
 cursor.execute("CREATE TABLE IF NOT EXISTS subreddit(subreddit primary key, minimum_score, fetch_by, fetch_max)")
+cursor.execute("CREATE TABLE IF NOT EXISTS block(name primary key)")
 connection.commit()
 connection.close()
\ No newline at end of file
diff --git a/app/scrape_posts.py b/app/scrape_posts.py
index e04f8b7..649010c 100755
--- a/app/scrape_posts.py
+++ b/app/scrape_posts.py
@@ -18,7 +18,7 @@ from yars.utils import download_image
 miner = YARS()
 
 # Function to scrape subreddit post details and save to JSON
-def scrape_subreddit_data(subreddit, minimum_score=100, pull_by="day", limit=5):
+def scrape_subreddit_data(subreddit, minimum_score=100, pull_by="day", limit=5, blocked_subs=[], blocked_users=[]):
     ret = []
     print(f"Starting {subreddit} with min score {minimum_score}, by {pull_by}, limit {limit}")
     empty = dict()
@@ -32,6 +32,7 @@ def scrape_subreddit_data(subreddit, minimum_score=100, pull_by="day", limit=5):
             break
         post_data = {
             "permalink": post.get("permalink"),
+            "subreddit": post.get("subreddit"),
             "title": post.get("title", ""),
             "author": post.get("author", ""),
             "created_utc": post.get("created_utc", ""),
@@ -40,6 +41,12 @@
             "media_urls" : post.get("media_urls", []),
             "body": post.get("body", None),
         }
+        if post_data["subreddit"] in blocked_subs:
+            print(f'Ignoring post from {post_data["subreddit"]}')
+            continue
+        if post_data["author"] in blocked_users:
+            print(f'Ignoring post from {post_data["author"]}')
+            continue
         ret.append(post_data)
     print(f"Finished {subreddit}")
     return ret
@@ -56,8 +63,7 @@ def save_posts_to_db(data, cursor):
     binds = []
     for post in data:
         binds.append(post["permalink"])
-        m = re.search(r"\/r\/([a-zA-Z0-9_]+)\/.*", post["permalink"])
-        binds.append(m.group(1)) #subreddit
+        binds.append(post["subreddit"])
         binds.append(post["created_utc"])
         binds.append(post["score"])
         binds.append(False)
@@ -103,6 +109,23 @@ def download_comments_for_permalink(permalink, cursor):
     binds = [permalink, post_details["comments"]]
     cursor.execute(upsert, binds)
 
+def get_blocks(cursor):
+    select = """
+        SELECT
+            name
+        FROM
+            block
+    """
+    blocks = [row[0] for row in cursor.execute(select)]
+    subs = []
+    users = []
+    for block in blocks:
+        if "/r/" in block:
+            subs.append(block[3:])
+        elif "/u/" in block:
+            users.append(block[3:])
+    return subs, users
+
 # Main execution
 if __name__ == "__main__":
     os.makedirs(config.media_dir, exist_ok=True)
@@ -118,8 +141,9 @@
             subreddit
     """
     subreddits = cursor.execute(select).fetchall()
+    blocked_subs, blocked_users = get_blocks(cursor)
     for subreddit in subreddits:
-        post_data = scrape_subreddit_data(subreddit[0], subreddit[1], subreddit[2], subreddit[3])
+        post_data = scrape_subreddit_data(subreddit[0], subreddit[1], subreddit[2], subreddit[3], blocked_subs, blocked_users)
         save_posts_to_db(post_data, cursor)
     connection.commit()
     download_media(cursor)
diff --git a/app/templates/admin.html b/app/templates/admin.html
index 77a9b58..5521a0e 100755
--- a/app/templates/admin.html
+++ b/app/templates/admin.html
[The three admin.html hunks (@@ -170,14 +170,14 @@, @@ -186,19 +186,20 @@ and @@ -213,11 +214,34 @@) lost their HTML markup in this copy of the patch and are not reproduced here. The surviving text shows that, under the "Admin Panel" heading, the "Subreddits" table now loops over subreddits instead of sub_subreddits, each row rendering /r/{{ subreddit[0] }}, {{ subreddit[1] }}, {{ subreddit[2] }} and {{ subreddit[3] }} beneath column headers including Subreddit, Fetch max and Update, with a form row prefixed "/r/" for adding or updating a subreddit. A new "Blocked" section is added: a table with a Name column listing each {{ block }}, plus a row for submitting a new entry.]
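Because the real markup is unrecoverable, the following is only a rough, hypothetical sketch of what the added "Blocked" fragment plausibly looks like, based on the surviving Jinja expressions and the new /admin POST branch above; every element, attribute and field name here is an assumption, not the author's actual template:

    <h1>Blocked</h1>
    <table>
      <tr><th>Name</th></tr>
      {% for block in blocks %}
      <tr><td>{{ block }}</td></tr>
      {% endfor %}
    </table>
    <!-- hypothetical add form: the /admin POST handler reads form fields "type" and "name",
         and get_blocks() expects names shaped like /r/... or /u/... -->
    <form method="POST" action="/admin">
      <input type="hidden" name="type" value="block">
      <input type="text" name="name" placeholder="/r/subreddit or /u/user">
      <button type="submit">Add</button>
    </form>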