Github, Python, Reddit and more

23 Sep 2013

Ever find yourself wondering what the name of that song was you listened to a week ago on YouTube? This happens to me all the time. I browse the /r/electronicmusic subreddit almost every day to find new music to listen to. I would upvote songs and then, a week later, find myself trying to look through my history for the songs I liked, but it was always a hassle to weed through all my upvoted content just to find songs to listen to.

Knowing that Reddit offers an API, I thought there was probably a way to automate reading through my history and listing all the songs I liked, saving myself hours of manual labor (I am very lazy…err, I mean efficient).

I wanted to accomplish three things with this mini project:

  • Write a python script
  • Get myself on GitHub
  • A consistently updated list of good music to listen to

Overview

This Python script calls Reddit’s API, parses a user’s ‘liked’ history for a given subreddit and outputs a list of HTML links to a file.

Usage

  1. Download the redditlikedlist.py and rll.cfg to a directory of your choice.

  2. Populate the fields in the rll.cfg file (see the Configuration File section below).

  3. Execute the python script

    $> python redditlikedlist.py rll.cfg

The first time the script is run, it will start with your most recent liked links and work backwards into the past. Reddit will allow you to parse your most recent 1000 links. After the first run, the script stores the most recent link in the ‘beforelinkname’ field of the configuration file. On subsequent executions, the script will start parsing from this link and work forward to the present.

Below I will post the current version of this script; however, please see the GitHub repository for the most up-to-date version.

Configuration File

[PATHS]  
outputfile = (The destination of the output file. Example: /home/John/musiclist.html) 
subreddit = (The subreddit you want to parse from. Example: electronicmusic) 

[WAYPOINT] 
beforelinkname = (leave blank, the script will populate) 

[CREDENTIALS] 
username = (Your Reddit username, must have a valid Reddit account to use this script.) 
password = (Your Reddit password, must have a valid Reddit account to use this script.) 
useragent = (Reddit requires a unique user agent for all calls to its API, 
             it is recommended you incorporate your username in the agent. Example: BobaFett37's Liked List Parse)

Python Script

Python 3+ is required to run this script. I have another mostly working version on GitHub for Python 2.6.6

# This script will log in to Reddit, return all liked stories for a given user,
# parse all the subreddit likes and build an output for
# a website listing.
#
import time
import datetime
import urllib.request, urllib.parse, urllib.error
import urllib.request, urllib.error, urllib.parse
import http.cookiejar
import json
import configparser
import logging
import tempfile
import os
import argparse

# Module-level state shared by the functions below.

# Settings filled in from the command line and the configuration file.
cfg_file = final_file_location = username = password = subreddit = ''

# Headers sent with every request; the User-Agent entry is added when the
# configuration is loaded.
hdr = dict()

# Paging state: the link name to page from and whether to page with
# 'before' (True) or 'after' (False).
link_value = ''
before_flag = False

# Number of liked entries seen so far.
iCounter = 0

# URL template; the <...> placeholders are substituted in retrieve_liked().
liked_url = 'https://ssl.reddit.com/user/<username>/liked.json?limit=100&<direction>=<link_name>'

# Scratch file that collects the generated links until write_output() runs.
tmpfile = tempfile.TemporaryFile()

# One cookie-aware opener is shared by login() and retrieve_liked(), so any
# cookies received by one request are sent with the following ones.
cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cj),
)

# Parse input
def parse_input_params(argv=None):
    """Parse the command line and record the configuration file path.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) parses
            ``sys.argv[1:]`` as before.

    Returns:
        The configuration file path. It is also stored in the module-level
        ``cfg_file`` variable, which the other functions read.

    Raises:
        SystemExit: Raised by argparse when the required argument is missing
            or ``--help`` is requested.
    """
    global cfg_file
    parser = argparse.ArgumentParser(
        description='Music list built from liked stories within a subreddit')
    parser.add_argument("config_file", help="Configuration file location")
    cfg_file = parser.parse_args(argv).config_file
    return cfg_file

# Load config file
def get_config():
    """Load the configuration file named by the module-level ``cfg_file``.

    Option names keep the parser's default case-insensitive matching, which
    the rest of the script relies on when it asks for ``outputFile`` or
    ``beforeLinkName`` while the file spells them in lower case.

    Returns:
        The ``ConfigParser`` holding whatever could be read (empty when the
        file is missing or unreadable), or ``None`` when the file exists but
        cannot be parsed.
    """
    # ConfigParser replaces SafeConfigParser, which was deprecated in
    # Python 3.2 and removed in Python 3.12.
    config = configparser.ConfigParser()
    try:
        loaded = config.read(cfg_file)
    except (configparser.Error, UnicodeError) as e:
        logging.error(e)
        return None

    # read() silently skips files it cannot open, so report that explicitly.
    if not loaded:
        logging.error("Configuration file could not be read: %s", cfg_file)
    return config

# Retrieve values from config file
def load_config_values(config):
    """Copy the settings held by *config* into the module-level variables.

    Reads the output path and subreddit from ``[PATHS]``, the stored waypoint
    from ``[WAYPOINT]`` and the login details and user agent from
    ``[CREDENTIALS]``. A non-empty waypoint switches ``before_flag`` on.

    Any failure (for example a missing section or option) is logged and the
    remaining values are left untouched.
    """
    global final_file_location, subreddit, link_value, before_flag
    global username, password, hdr

    try:
        lookup = config.get

        final_file_location = lookup('PATHS', 'outputFile')
        subreddit = lookup('PATHS', 'subreddit')

        link_value = lookup('WAYPOINT', 'beforeLinkName')
        if link_value:
            # A stored waypoint means paging uses the 'before' direction.
            before_flag = True

        username = lookup('CREDENTIALS', 'username')
        password = lookup('CREDENTIALS', 'password')
        hdr['User-Agent'] = lookup('CREDENTIALS', 'useragent')
    except Exception as e:
        logging.error(e)

# Reddit Login Function
def login(username, passwd):
    """POST the account credentials to Reddit's login endpoint.

    The request goes through the shared cookie-aware ``opener``, so any
    cookies the server sets are kept for later requests. The response body is
    read and discarded; a failed request is logged rather than raised.
    """
    form = urllib.parse.urlencode({
        'user': username,
        'api_type': 'json',
        'passwd': passwd,
    }).encode('utf-8')
    request = urllib.request.Request('https://ssl.reddit.com/api/login/', headers=hdr)

    try:
        opener.open(request, form).read()
    except Exception as e:
        logging.error(e)
        
def process_reddit_data():
    """Page through the user's liked links and collect the matching ones.

    Each page is fetched with ``retrieve_liked``. The paging cursor
    (``link_value``) is taken from the listing's ``before`` field when
    ``before_flag`` is set and from ``after`` otherwise; paging stops when the
    cursor is ``None`` or a fetch fails. The name of the very first entry seen
    is saved as the waypoint via ``write_config_values``. Entries that belong
    to the configured subreddit and have media attached are written to
    ``tmpfile`` as HTML anchors, one per line.

    Errors are logged and end the paging loop; nothing is raised.
    """
    global link_value
    global iCounter
    try:
        while link_value is not None:
            # Pause before every request so consecutive API calls stay spaced out.
            time.sleep(3)
            liked_json = retrieve_liked(username)
            if liked_json is None:
                # retrieve_liked() has already logged the failure; stop here
                # instead of failing later inside json.loads(None).
                break

            # Decode the payload once and reuse it for the cursor and entries.
            listing = json.loads(liked_json)["data"]
            link_value = listing["before" if before_flag else "after"]

            # NOTE(review): when paging forward ('before') over more than one
            # page, the newest link is presumably on the last page rather than
            # the first -- confirm whether the waypoint should follow it.
            for entry in listing["children"]:
                item = entry["data"]
                iCounter += 1
                if iCounter == 1:
                    write_config_values(item["name"])

                if item["subreddit"] == subreddit and item["media"] is not None:
                    # The href is delimited by single quotes, so an apostrophe
                    # in the URL would end the attribute early.
                    # NOTE(review): Reddit's JSON reportedly arrives with <, >
                    # and & already entity-encoded (unless raw_json=1), so
                    # only the quote is escaped here -- confirm.
                    safe_url = item["url"].replace("'", "&#39;")
                    anchor = "<a href='" + safe_url + "'>" + item["title"] + "</a><br/>\n"
                    tmpfile.write(anchor.encode('utf-8'))

    except Exception as e:
        logging.error(e)
        
# Fetch liked content for a user
def retrieve_liked(username):
    """Fetch one page of *username*'s liked listing and return it as text.

    The request URL is built from ``liked_url`` using the current paging
    direction and ``link_value``. Returns the response decoded as UTF-8, or
    ``None`` (after logging the error) when anything goes wrong.
    """
    try:
        direction = 'before' if before_flag == True else 'after'
        substitutions = {
            '<username>': username,
            '<link_name>': link_value,
            '<direction>': direction,
        }
        request = urllib.request.Request(
            replace_all(liked_url, substitutions),
            headers=hdr,
        )
        return opener.open(request).read().decode('utf-8')
    except Exception as e:
        logging.error(e)

# Write/Update config file
def write_config_values(before_link):
    """Store *before_link* as the waypoint and rewrite the configuration file.

    Updates ``beforeLinkName`` in the ``[WAYPOINT]`` section of the
    module-level ``configVal`` parser (creating the section if it is missing)
    and writes the whole configuration back to ``cfg_file``.

    Args:
        before_link: Name of the link the next run should start from.

    Failures are logged rather than raised.
    """
    try:
        if not configVal.has_section('WAYPOINT'):
            configVal.add_section('WAYPOINT')
        configVal.set('WAYPOINT', 'beforeLinkName', before_link)
        # The context manager guarantees the file is flushed and closed; the
        # previous code referenced f.close without calling it.
        with open(cfg_file, 'w') as f:
            configVal.write(f)
    except Exception as e:
        logging.error(e)

def updated_timestamp():
    """Return the HTML 'Last updated at' line for the current local time."""
    now = datetime.datetime.fromtimestamp(time.time())
    formatted = now.strftime('%m-%d-%Y %H:%M:%S')
    return "".join(("Last updated at: ", formatted, "<br/><br/>\n"))
        
def write_output():
    """Write the collected links to the final output file, newest first.

    If the output file already exists, its previous contents (minus the first
    line, which holds the old timestamp) are appended to ``tmpfile`` after the
    newly collected links. The output file is then rewritten with a fresh
    timestamp line followed by everything in ``tmpfile``.

    Errors are logged rather than raised, and ``tmpfile`` is closed in every
    case.
    """
    try:
        if os.path.exists(final_file_location):
            # Copy the old links as raw bytes: the file was written as UTF-8
            # bytes, so no locale-dependent decode/encode round trip is needed.
            with open(final_file_location, 'rb') as existing:
                # Skip the old timestamp line; the default keeps an empty
                # file from raising StopIteration and aborting the write.
                next(existing, None)
                tmpfile.writelines(existing)

        tmpfile.seek(0)
        with open(final_file_location, 'wb') as out:
            out.write(updated_timestamp().encode('utf-8'))
            out.writelines(tmpfile)
    except Exception as e:
        logging.error(e)
    finally:
        # Release the scratch file even when writing failed.
        tmpfile.close()

# generic replace text using dict function
def replace_all(text, dic):
    """Return *text* with every key of *dic* replaced by its value.

    Replacements are applied one after another in the dictionary's iteration
    order, so a later replacement also sees the output of earlier ones.
    """
    result = text
    for placeholder in dic:
        result = result.replace(placeholder, dic[placeholder])
    return result


###########################################################
# Main Processing
###########################################################
# Read the configuration file path from the command line, parse the file and
# copy its settings into the module-level variables. configVal stays at module
# level because write_config_values() updates it and writes it back to disk.
parse_input_params()
configVal = get_config()
load_config_values(configVal)

# Log in and retrieve the liked content. Each call must be separated by at
# least 2 seconds; process_reddit_data() sleeps before every request it makes.
# write_output() then merges the collected links into the output file.
login(username, password)
process_reddit_data()
write_output()