diff --git a/README.md b/README.md
index fa7064a..e9c76c3 100755
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ You can change the host port, host volume directories, how often reddit is scann
## ./app/config.py
-You can change how much data is pulled, from where, the minimum score to save it to your DB, and how long it is retained.
+You can change how many posts are displayed per page load and how long data is retained.
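+
+For example, data retention is controlled by max_age_days:
+```
+max_age_days = 30
+```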
### Startup
@@ -15,13 +15,13 @@ docker compose build
docker compose up
```
-The DB is created automatically. You will want to run
+The DB is created automatically. You will want to visit the /admin endpoint to set up your subreddits, then run
```
docker exec -it reddit-web-1 sh -c "python3 /app/scrape_posts.py"
```
-to populate the DB with initial data, or you will have to wait for the scheduled task to get triggered for the web page to be usable.
+to populate the DB with initial data; otherwise you will have to wait for the scheduled task to run before any posts show up.
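+
+Subreddits can also be managed from the command line, for example (replace <port> with the host port from your compose file):
+```
+# add or update a subreddit
+curl -X POST -d "name=pcgaming" -d "score=50" -d "by=day" -d "max=100" http://localhost:<port>/admin
+# remove a subreddit
+curl -X DELETE "http://localhost:<port>/admin?name=pcgaming"
+```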
### Thanks
diff --git a/app/app.py b/app/app.py
index 020bcca..e447b01 100755
--- a/app/app.py
+++ b/app/app.py
@@ -40,8 +40,70 @@ def hide_post(permalink):
@app.route('/')
def index():
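+    # first-run convenience: show the admin page when no subreddits have been configured yet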
+    connection = sqlite3.connect(config.db_file)
+    cursor = connection.cursor()
+    select = """
+        SELECT
+            count(*)
+        FROM
+            subreddit
+        """
+    count = cursor.execute(select).fetchone()[0]
+    connection.close()
+    if count == 0:
+        return admin()
return front_page()
+@app.route('/admin', methods=['GET', 'POST', 'DELETE'])
+def admin():
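+    # manage the subreddit table: DELETE removes a subreddit, POST adds or updates one, GET just renders the current list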
+    connection = sqlite3.connect(config.db_file)
+    cursor = connection.cursor()
+    if request.method == 'DELETE':
+        delete = """
+            DELETE FROM
+                subreddit
+            WHERE
+                subreddit = ?
+            """
+        binds = [request.args.get("name")]
+        cursor.execute(delete, binds)
+        connection.commit()
+        connection.close()
+        return ""
+    elif request.method == 'POST':
+        upsert = """
+            INSERT INTO
+                subreddit (subreddit, minimum_score, fetch_by, fetch_max)
+            VALUES
+                (?, ?, ?, ?)
+            ON CONFLICT
+                (subreddit)
+            DO UPDATE SET
+                minimum_score=excluded.minimum_score,
+                fetch_by=excluded.fetch_by,
+                fetch_max=excluded.fetch_max
+            """
+        binds = [
+            request.form.get("name"),
+            int(request.form.get("score")),
+            request.form.get("by"),
+            int(request.form.get("max"))
+        ]
+        cursor.execute(upsert, binds)
+        connection.commit()
+    post_subreddits = get_subreddits(cursor)
+    select = """
+        SELECT
+            subreddit,
+            minimum_score,
+            fetch_by,
+            fetch_max
+        FROM
+            subreddit
+        """
+    sub_subreddits = cursor.execute(select).fetchall()
+    connection.close()
+    return render_template('admin.html', post_subreddits=post_subreddits, sub_subreddits=sub_subreddits)
+
@app.route('/r/all')
def front_page():
title = "/r/all"
@@ -156,6 +218,7 @@ def get_subreddits(cursor):
subreddits = [f"/r/{sub[0]}" for sub in results]
subreddits.insert(0, "/r/all")
subreddits.append("/r/other")
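+    # link the admin page from the subreddit navigation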
+ subreddits.append("/admin")
return subreddits
def get_posts_from_select(cursor, select, binds):
diff --git a/app/config.py b/app/config.py
index eb1d488..f1881b6 100644
--- a/app/config.py
+++ b/app/config.py
@@ -1,27 +1,4 @@
# Scheduler configuration
-max_posts_per_pull = 100
-pull_by = "day"
-subreddits = [
- # name, minimum upvotes
- ("pcgaming", 50),
- ("gadgets", 10),
- ("Nightreign", 100),
- ("CuratedTumblr", 100),
- ("196", 100),
- ("PoliticalCompassMemes", 100),
- ("meirl", 100),
- ("me_irl", 100),
- ("Fauxmoi", 100),
- ("NoFilterNews", 100),
- ("linux", 100),
- ("linux4noobs", 100),
- ("selfhosted", 100),
- ("HomeServer", 100),
- ("homelab", 100),
- ("NonPoliticalTwitter", 100),
- ("comics", 100),
- ("all", 1000)
-]
max_age_days = 30
max_age_seconds = max_age_days * 24 * 60 * 60
other_posts_cutoff = 1 #subreddits with this many unread posts or fewer are merged to /r/other
diff --git a/app/make_db.py b/app/make_db.py
index 8450450..6edc0f6 100755
--- a/app/make_db.py
+++ b/app/make_db.py
@@ -8,5 +8,6 @@ connection = sqlite3.connect(config.db_file)
cursor = connection.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS post(permalink primary key, subreddit, created_utc, score, media_fetched, post, hidden)")
cursor.execute("CREATE TABLE IF NOT EXISTS media(permalink, url , local, PRIMARY KEY (permalink, url))")
+cursor.execute("CREATE TABLE IF NOT EXISTS subreddit(subreddit primary key, minimum_score, fetch_by, fetch_max)")
connection.commit()
connection.close()
\ No newline at end of file
diff --git a/app/scrape_posts.py b/app/scrape_posts.py
index fdef9c2..e04f8b7 100755
--- a/app/scrape_posts.py
+++ b/app/scrape_posts.py
@@ -18,102 +18,110 @@ from yars.utils import download_image
miner = YARS()
# Function to scrape subreddit post details and save to JSON
-def scrape_subreddit_data(subreddit, limit=5):
- ret = []
- subreddit_name = subreddit[0]
- minimum_score = subreddit[1]
- print(f"Starting {subreddit_name}")
- empty = dict()
- try:
- subreddit_posts = miner.fetch_subreddit_posts(
- subreddit_name, limit=limit, category="top", time_filter=config.pull_by
- )
- for i, post in enumerate(subreddit_posts, 1):
- score = post.get("score", 0)
- if score < minimum_score:
- continue
- post_data = {
- "permalink": post.get("permalink"),
- "title": post.get("title", ""),
- "author": post.get("author", ""),
- "created_utc": post.get("created_utc", ""),
- "num_comments": post.get("num_comments", 0),
- "score": post.get("score", 0),
- "media_urls" : post.get("media_urls", []),
- "body": post.get("body", None),
- }
- ret.append(post_data)
- print(f"Finished {subreddit_name}")
- return ret
- except Exception as e:
- print(f"Error occurred while scraping subreddit: {e}")
- return ret
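+# Per-subreddit settings now come from the subreddit table in the DB rather than config.py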
+def scrape_subreddit_data(subreddit, minimum_score=100, pull_by="day", limit=5):
+    ret = []
+    print(f"Starting {subreddit} with min score {minimum_score}, by {pull_by}, limit {limit}")
+    empty = dict()
+    try:
+        subreddit_posts = miner.fetch_subreddit_posts(
+            subreddit, limit=limit, category="top", time_filter=pull_by
+        )
+        for i, post in enumerate(subreddit_posts, 1):
+            score = post.get("score", 0)
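+            # posts are fetched in "top" order, so once one falls below the minimum score the rest will too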
+            if score < minimum_score:
+                break
+            post_data = {
+                "permalink": post.get("permalink"),
+                "title": post.get("title", ""),
+                "author": post.get("author", ""),
+                "created_utc": post.get("created_utc", ""),
+                "num_comments": post.get("num_comments", 0),
+                "score": post.get("score", 0),
+                "media_urls": post.get("media_urls", []),
+                "body": post.get("body", None),
+            }
+            ret.append(post_data)
+        print(f"Finished {subreddit}")
+        return ret
+    except Exception as e:
+        print(f"Error occurred while scraping subreddit: {e}")
+        return ret
def save_posts_to_db(data, cursor):
- if len(data)==0:
- return
- upsert = "INSERT INTO post(permalink, subreddit, created_utc, score, media_fetched, post, hidden) VALUES "
- upsert += ",".join(["(?,?,?,?,?,?,?)"] * len(data))
- upsert += " ON CONFLICT(permalink) DO UPDATE SET score=excluded.score, post=excluded.post"
- binds = []
- for post in data:
- binds.append(post["permalink"])
- m = re.search(r"\/r\/([a-zA-Z0-9_]+)\/.*", post["permalink"])
- binds.append(m.group(1)) #subreddit
- binds.append(post["created_utc"])
- binds.append(post["score"])
- binds.append(False)
- binds.append(json.dumps(post))
- binds.append(False)
- cursor.execute(upsert, binds)
+    if len(data)==0:
+        return
+    upsert = "INSERT INTO post(permalink, subreddit, created_utc, score, media_fetched, post, hidden) VALUES "
+    upsert += ",".join(["(?,?,?,?,?,?,?)"] * len(data))
+    upsert += " ON CONFLICT(permalink) DO UPDATE SET score=excluded.score, post=excluded.post"
+    binds = []
+    for post in data:
+        binds.append(post["permalink"])
+        m = re.search(r"\/r\/([a-zA-Z0-9_]+)\/.*", post["permalink"])
+        binds.append(m.group(1)) #subreddit
+        binds.append(post["created_utc"])
+        binds.append(post["score"])
+        binds.append(False)
+        binds.append(json.dumps(post))
+        binds.append(False)
+    cursor.execute(upsert, binds)
def download_media(cursor):
- select = "SELECT post FROM post WHERE media_fetched = ? AND hidden = ?"
- binds = [False, False]
- results = cursor.execute(select, binds)
- post = results.fetchone()
- binds = []
- while post is not None:
- post = json.loads(post[0])
- if len(post["media_urls"])>0:
- for url in post["media_urls"]:
- binds.append(post["permalink"])
- binds.append(url)
- path = download_image(url, config.media_dir)
- binds.append(path)
- print(f"Downloaded {path}")
- post = results.fetchone()
-
- if len(binds)>0:
- upsert = "INSERT INTO media(permalink, url, local) VALUES "
- upsert += ",".join(["(?,?,?)"] * (len(binds)//3))
- upsert += " ON CONFLICT(permalink, url) DO UPDATE SET local=excluded.local"
- cursor.execute(upsert, binds)
+    select = "SELECT post FROM post WHERE media_fetched = ? AND hidden = ?"
+    binds = [False, False]
+    results = cursor.execute(select, binds)
+    post = results.fetchone()
+    binds = []
+    while post is not None:
+        post = json.loads(post[0])
+        if len(post["media_urls"])>0:
+            for url in post["media_urls"]:
+                binds.append(post["permalink"])
+                binds.append(url)
+                path = download_image(url, config.media_dir)
+                binds.append(path)
+                print(f"Downloaded {path}")
+        post = results.fetchone()
+
+    if len(binds)>0:
+        upsert = "INSERT INTO media(permalink, url, local) VALUES "
+        upsert += ",".join(["(?,?,?)"] * (len(binds)//3))
+        upsert += " ON CONFLICT(permalink, url) DO UPDATE SET local=excluded.local"
+        cursor.execute(upsert, binds)
- update = "UPDATE post SET media_fetched = ? WHERE media_fetched = ?"
- binds = [True, False]
- cursor.execute(update, binds)
+    update = "UPDATE post SET media_fetched = ? WHERE media_fetched = ?"
+    binds = [True, False]
+    cursor.execute(update, binds)
def download_comments_for_permalink(permalink, cursor):
- # Currently unused
- post_details = miner.scrape_post_details(permalink)
- update = "UPDATE post SET body = ? WHERE permalink = ?"
- binds = [post_details["body"], permalink]
- cursor.execute(update, binds)
+    # Currently unused
+    post_details = miner.scrape_post_details(permalink)
+    update = "UPDATE post SET body = ? WHERE permalink = ?"
+    binds = [post_details["body"], permalink]
+    cursor.execute(update, binds)
- upsert += "INSERT INTO comments(permalink, comments) VALUES (?, ?) ON CONFLICT(permalink) DO UPDATE SET comments=excluded.comments"
- binds = [permalink, post_details["comments"]]
- cursor.execute(upsert, binds)
+    upsert = "INSERT INTO comments(permalink, comments) VALUES (?, ?) ON CONFLICT(permalink) DO UPDATE SET comments=excluded.comments"
+    binds = [permalink, post_details["comments"]]
+    cursor.execute(upsert, binds)
# Main execution
if __name__ == "__main__":
- os.makedirs(config.media_dir, exist_ok=True)
- connection = sqlite3.connect(config.db_file)
- cursor = connection.cursor()
- for subreddit in config.subreddits:
- post_data = scrape_subreddit_data(subreddit, config.max_posts_per_pull)
- save_posts_to_db(post_data, cursor)
- connection.commit()
- download_media(cursor)
- connection.commit()
- connection.close()
\ No newline at end of file
+    os.makedirs(config.media_dir, exist_ok=True)
+    connection = sqlite3.connect(config.db_file)
+    cursor = connection.cursor()
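+    # the subreddits to scrape are now read from the DB (managed through the /admin page)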
+    select = """
+        SELECT
+            subreddit,
+            minimum_score,
+            fetch_by,
+            fetch_max
+        FROM
+            subreddit
+        """
+    subreddits = cursor.execute(select).fetchall()
+    for subreddit in subreddits:
+        post_data = scrape_subreddit_data(subreddit[0], subreddit[1], subreddit[2], subreddit[3])
+        save_posts_to_db(post_data, cursor)
+        connection.commit()
+    download_media(cursor)
+    connection.commit()
+    connection.close()
\ No newline at end of file
diff --git a/app/templates/admin.html b/app/templates/admin.html
new file mode 100755
index 0000000..77a9b58
--- /dev/null
+++ b/app/templates/admin.html
@@ -0,0 +1,228 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Reddit, but better</title>
+</head>
+<body>
+    <h1>Admin Panel</h1>
+    <h2>Subreddits</h2>
+    <table>
+        <tr>
+            <th>Subreddit</th>
+            <th>Minimum Score</th>
+            <th>Fetch by</th>
+            <th>Fetch max</th>
+            <th>Update</th>
+        </tr>
+        {% for subreddit in sub_subreddits %}
+        <tr>
+            <td>/r/{{ subreddit[0] }}</td>
+            <td>{{ subreddit[1] }}</td>
+            <td>{{ subreddit[2] }}</td>
+            <td>{{ subreddit[3] }}</td>
+            <td></td>
+        </tr>
+        {% endfor %}
+    </table>
+</body>
+</html>