#!/usr/bin/env python3
# Scrape configured subreddits with YARS, store posts in SQLite, and download media.
import config
|
|
import datetime
|
|
import json
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import sys
|
|
|
|
# Make the YARS package importable: config.yars_dir points at the YARS
# checkout, whose src/ directory contains the package. This must run BEFORE
# the `from yars...` imports below, so the import order here is deliberate.
current_dir = os.path.dirname(os.path.abspath(__file__))  # NOTE(review): appears unused — confirm before removing
project_root = config.yars_dir
src_path = os.path.join(project_root, "src")
sys.path.append(src_path)

from yars.yars import YARS
from yars.utils import download_image

# Initialize the YARS Reddit miner (module-level singleton shared by the
# scraping helpers below).
miner = YARS()
|
|
|
|
# Function to scrape subreddit post details and return them as a list of dicts
def scrape_subreddit_data(subreddit, limit=5):
    """Fetch top posts for one subreddit, keeping only those above its score floor.

    Args:
        subreddit: a (name, minimum_score) pair; posts scoring below
            minimum_score are skipped.
        limit: maximum number of posts to request from the miner.

    Returns:
        A list of post dicts (possibly empty). On any scraping error the
        posts collected so far are returned — best-effort by design.
    """
    posts = []
    subreddit_name, minimum_score = subreddit
    print(f"Starting {subreddit_name}")
    try:
        subreddit_posts = miner.fetch_subreddit_posts(
            subreddit_name, limit=limit, category="top", time_filter=config.pull_by
        )
        for post in subreddit_posts:
            # Drop posts under the per-subreddit score threshold.
            if post.get("score", 0) < minimum_score:
                continue
            posts.append(
                {
                    "permalink": post.get("permalink"),
                    "title": post.get("title", ""),
                    "author": post.get("author", ""),
                    "created_utc": post.get("created_utc", ""),
                    "num_comments": post.get("num_comments", 0),
                    "score": post.get("score", 0),
                    "media_urls": post.get("media_urls", []),
                    "body": post.get("body", None),
                }
            )
        print(f"Finished {subreddit_name}")
        return posts
    except Exception as e:
        # Best-effort: report and return whatever was collected before the failure.
        print(f"Error occurred while scraping subreddit: {e}")
        return posts
|
|
|
|
def save_posts_to_db(data, cursor):
    """Upsert scraped posts into the `post` table.

    Each row stores the permalink (primary key), the subreddit name parsed
    from the permalink, creation time, score, a media_fetched flag (False —
    media is pulled later by download_media()), the full post as JSON, and a
    hidden flag (False). On permalink conflict only score and the JSON blob
    are refreshed.

    Args:
        data: list of post dicts as produced by scrape_subreddit_data().
        cursor: an open sqlite3 cursor; the caller owns commit/close.

    FIX: the original built one giant multi-VALUES statement (7 parameters
    per post), which exceeds SQLite's default 999-host-parameter limit once
    more than 142 posts are batched. executemany() sidesteps the limit.
    """
    if not data:
        return
    upsert = (
        "INSERT INTO post(permalink, subreddit, created_utc, score, "
        "media_fetched, post, hidden) VALUES (?,?,?,?,?,?,?) "
        "ON CONFLICT(permalink) DO UPDATE SET "
        "score=excluded.score, post=excluded.post"
    )
    rows = []
    for post in data:
        m = re.search(r"\/r\/([a-zA-Z0-9_]+)\/.*", post["permalink"])
        rows.append(
            (
                post["permalink"],
                m.group(1),  # subreddit name parsed from the permalink
                post["created_utc"],
                post["score"],
                False,  # media_fetched
                json.dumps(post),
                False,  # hidden
            )
        )
    cursor.executemany(upsert, rows)
|
|
|
|
def download_media(cursor):
    """Download media for posts not yet fetched and record the local paths.

    Reads every post row flagged media_fetched = False, downloads each of its
    media URLs into config.media_dir, upserts one media row per URL, and then
    flips media_fetched on all previously-unfetched posts.

    Args:
        cursor: an open sqlite3 cursor; the caller owns commit/close.
    """
    pending = cursor.execute(
        "SELECT post FROM post WHERE media_fetched = ?", [False]
    ).fetchall()

    media_rows = []
    for (raw_post,) in pending:
        post = json.loads(raw_post)
        for url in post["media_urls"]:
            local_path = download_image(url, config.media_dir)
            media_rows.append((post["permalink"], url, local_path))
            print("image downloaded")

    if media_rows:
        # One batched upsert; refresh the local path on (permalink, url) conflict.
        upsert = (
            "INSERT INTO media(permalink, url, local) VALUES "
            + ",".join(["(?,?,?)"] * len(media_rows))
            + " ON CONFLICT(permalink, url) DO UPDATE SET local=excluded.local"
        )
        flat_binds = [value for row in media_rows for value in row]
        cursor.execute(upsert, flat_binds)

    # Mark everything we just processed as fetched.
    cursor.execute(
        "UPDATE post SET media_fetched = ? WHERE media_fetched = ?", [True, False]
    )
|
|
|
|
def download_comments_for_permalink(permalink, cursor):
    """Fetch a post's body and comments from Reddit and store them in the DB.

    Currently unused.

    Args:
        permalink: the post's permalink (primary key in the post table).
        cursor: an open sqlite3 cursor; the caller owns commit/close.
    """
    post_details = miner.scrape_post_details(permalink)

    update = "UPDATE post SET body = ? WHERE permalink = ?"
    cursor.execute(update, [post_details["body"], permalink])

    # FIX: the original did `upsert += ...` on an undefined name, raising
    # NameError on every call.
    upsert = (
        "INSERT INTO comments(permalink, comments) VALUES (?, ?) "
        "ON CONFLICT(permalink) DO UPDATE SET comments=excluded.comments"
    )
    # Comments are serialized to JSON, matching how save_posts_to_db stores
    # posts — sqlite3 cannot bind a nested list/dict directly. Presumably
    # post_details["comments"] is nested; confirm against YARS.
    cursor.execute(upsert, [permalink, json.dumps(post_details["comments"])])
|
|
|
|
# Main execution
if __name__ == "__main__":
    os.makedirs(config.media_dir, exist_ok=True)
    connection = sqlite3.connect(config.db_file)
    try:
        cursor = connection.cursor()
        # First pass: scrape each configured subreddit and persist its posts.
        for subreddit in config.subreddits:
            post_data = scrape_subreddit_data(subreddit, config.max_posts_per_pull)
            save_posts_to_db(post_data, cursor)
        connection.commit()
        # Second pass: download media for any posts not yet fetched.
        download_media(cursor)
        connection.commit()
    finally:
        # FIX: close even when scraping/downloading raises, so the DB
        # connection isn't leaked on error.
        connection.close()