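#!/usr/bin/env python3
"""Scrape Reddit posts with YARS and archive them in SQLite.

The subreddits to pull and their score thresholds are read from the
`subreddit` table, blocklist entries from the `block` table. Matching
posts are upserted into the `post` table as JSON and any attached media
is downloaded to disk. Settings come from a local `config` module (a
sketch of its expected contents follows the imports below).
"""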
import config
import json
import os
import sqlite3
import sys

# Make the YARS package importable from its source checkout.
project_root = config.yars_dir
src_path = os.path.join(project_root, "src")
sys.path.append(src_path)

from yars.yars import YARS
from yars.utils import download_image

# Initialize the YARS Reddit miner
miner = YARS()

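# `config` is a project-local settings module that is not part of this file.
# A minimal sketch of what this script assumes config.py defines (the paths
# are placeholders):
#
#   yars_dir = "/path/to/yars"       # YARS checkout containing src/yars
#   media_dir = "/path/to/media"     # directory for downloaded images
#   db_file = "/path/to/reddit.db"   # SQLite database file
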
# Scrape post details from a subreddit, applying score and blocklist filters
def scrape_subreddit_data(subreddit, minimum_score=100, pull_by="day", limit=5,
                          blocked_subs=(), blocked_users=(), blocked_words=()):
    ret = []
    print(f"Starting {subreddit}")
    try:
        subreddit_posts = miner.fetch_subreddit_posts(
            subreddit, limit=limit, category="top", time_filter=pull_by
        )
        for post in subreddit_posts:
            score = post.get("score", 0)
            # "top" posts arrive sorted by score, so stop at the first one
            # below the threshold.
            if score < minimum_score:
                break
            post_data = {
                "permalink": post.get("permalink"),
                "subreddit": post.get("subreddit"),
                "title": post.get("title", ""),
                "author": post.get("author", ""),
                "created_utc": post.get("created_utc", ""),
                "num_comments": post.get("num_comments", 0),
                "score": score,
                "media_urls": post.get("media_urls", []),
                "body": post.get("body", None),
            }
            if post_data["subreddit"] in blocked_subs:
                continue
            if post_data["author"] in blocked_users:
                continue
            for word in blocked_words:
                if word in post_data["title"]:
                    break
                if post_data["body"] is not None and word in post_data["body"]:
                    break
            else:
                # No blocked word matched (the loop ran to completion).
                ret.append(post_data)
        print(f"Finished {subreddit}")
        return ret
    except Exception as e:
        print(f"Error occurred while scraping subreddit: {e}")
        return ret


def save_posts_to_db(data, cursor):
    if len(data) == 0:
        return
    # Build one multi-row upsert; re-scraped posts refresh score and JSON.
    upsert = "INSERT INTO post(permalink, subreddit, created_utc, score, media_fetched, post, hidden, saved, author) VALUES "
    upsert += ",".join(["(?,?,?,?,?,?,?,?,?)"] * len(data))
    upsert += " ON CONFLICT(permalink) DO UPDATE SET score=excluded.score, post=excluded.post"
    binds = []
    for post in data:
        binds.append(post["permalink"])
        binds.append(post["subreddit"])
        binds.append(post["created_utc"])
        binds.append(post["score"])
        binds.append(False)             # media_fetched
        binds.append(json.dumps(post))  # full post record as JSON text
        binds.append(False)             # hidden
        binds.append(False)             # saved
        binds.append(post["author"])
    cursor.execute(upsert, binds)

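# The upsert above implies a schema roughly like this; an assumption
# reconstructed from the queries in this file, not taken from the repo:
#
#   CREATE TABLE post (
#       permalink     TEXT PRIMARY KEY,
#       subreddit     TEXT,
#       created_utc   TEXT,
#       score         INTEGER,
#       media_fetched BOOLEAN,
#       post          TEXT,     -- full post record as JSON
#       hidden        BOOLEAN,
#       saved         BOOLEAN,
#       author        TEXT
#   );
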
def download_media(cursor):
    # Walk every post whose media has not been fetched yet and download each
    # unique URL once, recording the local path per (permalink, url) pair.
    select = "SELECT post FROM post WHERE media_fetched = ? AND hidden = ?"
    binds = [False, False]
    results = cursor.execute(select, binds)
    row = results.fetchone()
    downloads = {}
    binds = []
    while row is not None:
        post = json.loads(row[0])
        for url in post["media_urls"]:
            binds.append(post["permalink"])
            binds.append(url)
            if url not in downloads:
                downloads[url] = download_image(url, config.media_dir)
            path = downloads[url]
            binds.append(path)
            print(f"Downloaded {path}")
        row = results.fetchone()

    if len(binds) > 0:
        upsert = "INSERT INTO media(permalink, url, local) VALUES "
        upsert += ",".join(["(?,?,?)"] * (len(binds) // 3))
        upsert += " ON CONFLICT(permalink, url) DO UPDATE SET local=excluded.local"
        cursor.execute(upsert, binds)

    # Mark every scanned post as fetched, including those without media,
    # so they are not rescanned on the next run.
    update = "UPDATE post SET media_fetched = ? WHERE media_fetched = ?"
    binds = [True, False]
    cursor.execute(update, binds)

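# Assumed shape of the media table, inferred from the upsert above:
#
#   CREATE TABLE media (
#       permalink TEXT,
#       url       TEXT,
#       local     TEXT,   -- path returned by download_image
#       UNIQUE (permalink, url)
#   );
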
def download_comments_for_permalink(permalink, cursor):
    # Currently unused
    post_details = miner.scrape_post_details(permalink)
    update = "UPDATE post SET body = ? WHERE permalink = ?"
    binds = [post_details["body"], permalink]
    cursor.execute(update, binds)

    upsert = "INSERT INTO comments(permalink, comments) VALUES (?, ?) ON CONFLICT(permalink) DO UPDATE SET comments=excluded.comments"
    # Comments come back as a structure, so serialize them to JSON text
    # (assuming the comments column stores text).
    binds = [permalink, json.dumps(post_details["comments"])]
    cursor.execute(upsert, binds)

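# Assumed shape of the comments table used above:
#
#   CREATE TABLE comments (
#       permalink TEXT PRIMARY KEY,
#       comments  TEXT   -- comment tree as JSON
#   );
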
def get_blocks(cursor):
    select = """
        SELECT
            name
        FROM
            block
    """
    blocks = [row[0] for row in cursor.execute(select)]
    subs = []
    users = []
    words = []
    for block in blocks:
        # The prefix must be at the start for the [3:] slice to make sense,
        # so test with startswith rather than substring containment.
        if block.startswith("/r/"):
            subs.append(block[3:])
        elif block.startswith("/u/"):
            users.append(block[3:])
        else:
            words.append(block)
    return subs, users, words

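# The block table is a single column of blocklist entries; by the convention
# inferred from the prefixes above, rows look like:
#
#   /r/somesub    -- hide an entire subreddit
#   /u/someuser   -- hide a specific author
#   someword      -- hide posts whose title or body contains the word
#
#   CREATE TABLE block (name TEXT);
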
# Main execution
if __name__ == "__main__":
    os.makedirs(config.media_dir, exist_ok=True)
    connection = sqlite3.connect(config.db_file)
    cursor = connection.cursor()
    select = """
        SELECT
            subreddit,
            minimum_score,
            fetch_by,
            fetch_max
        FROM
            subreddit
    """
    subreddits = cursor.execute(select).fetchall()
    blocked_subs, blocked_users, blocked_words = get_blocks(cursor)
    for subreddit in subreddits:
        post_data = scrape_subreddit_data(
            subreddit[0], subreddit[1], subreddit[2], subreddit[3],
            blocked_subs, blocked_users, blocked_words
        )
        save_posts_to_db(post_data, cursor)
    connection.commit()
    download_media(cursor)
    connection.commit()
    connection.close()
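
# Assumed shape of the subreddit table driving the main loop, with a
# hypothetical sample row:
#
#   CREATE TABLE subreddit (
#       subreddit     TEXT,      -- name passed to YARS
#       minimum_score INTEGER,   -- skip posts scoring below this
#       fetch_by      TEXT,      -- time_filter, e.g. "day" or "week"
#       fetch_max     INTEGER    -- maximum number of posts to request
#   );
#   INSERT INTO subreddit VALUES ('pics', 100, 'day', 5);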