170 lines
5.2 KiB
Python
170 lines
5.2 KiB
Python
|
#!/usr/bin/python3
|
||
|
import sys
|
||
|
from bs4 import BeautifulSoup
|
||
|
import requests
|
||
|
from requests.auth import HTTPBasicAuth
|
||
|
import io
|
||
|
import os
|
||
|
import time
|
||
|
from twitter import *
|
||
|
import tweepy
|
||
|
import difflib
|
||
|
# This imports from a file called 'constants'. This is at the moment, the file called 'constants_template'
|
||
|
import constants
|
||
|
import mail_handler
|
||
|
|
||
|
# One shared requests session for every HTTP call in this module, so the
# cookies picked up during the WAYF login flow persist between requests.
session = requests.session()

# Twitter API credentials, read from the local 'constants' module.
cfg = dict(
    consumer_key=constants.cons_key,
    consumer_secret=constants.cons_secret,
    access_token=constants.access_token,
    access_token_secret=constants.access_token_secret,
)
|
||
|
|
||
|
|
||
|
# Setting the Twitter connection up
def get_api(cfg):
    """Build an authenticated tweepy API client from a credentials dict.

    cfg must contain the keys 'consumer_key', 'consumer_secret',
    'access_token' and 'access_token_secret'.
    """
    handler = tweepy.OAuthHandler(cfg['consumer_key'], cfg['consumer_secret'])
    handler.set_access_token(cfg['access_token'], cfg['access_token_secret'])
    return tweepy.API(handler)
|
||
|
|
||
|
# Module-level Twitter client, created once at import time and used by tweeter().
api = get_api(cfg)
|
||
|
|
||
|
|
||
|
# The main method for scraping Stads
def findGrades():
    """Log in to Stads through WAYF and return the results page as soup.

    Prints the HTTP status code of each intermediate step; every print
    should show 200 when the login flow is working.
    """
    # First request to Stads; the reply carries a meta-refresh tag whose
    # 'content' attribute points at the real login entry point.
    landing = session.get('https://sbstads.au.dk/sb_STAP/sb/resultater/studresultater.jsp')
    landing_soup = BeautifulSoup(landing.text, 'html5lib')

    # Follow the meta tag's target. The attribute looks like "0;URL=<target>",
    # so the URL itself starts at offset 6.
    meta_redirect = session.get(landing_soup.find('meta')['content'][6:])

    # Should be 200 if the meta tag was found and followed correctly.
    print(meta_redirect.status_code)

    # Post the credentials to the WAYF login form we were redirected to.
    wayf_login = session.post(
        meta_redirect.url,
        data={'username': constants.USERNAME, 'password': constants.PASSWORD})

    # Should be 200.
    print(wayf_login.status_code)

    # Pull the SAMLResponse out of the login reply and hand it to WAYF's
    # assertion consumer so WAYF accepts the session.
    login_soup = BeautifulSoup(wayf_login.text, 'html5lib')
    saml_response = login_soup.find('input', {'name': 'SAMLResponse'})['value']
    wayf = session.post(
        'https://wayf.wayf.dk/module.php/saml/sp/saml2-acs.php/wayf.wayf.dk',
        data={'SAMLResponse': saml_response})

    # 200 here means WAYF accepted us \o/
    print(wayf.status_code)

    # WAYF answers with a fresh SAMLResponse plus a RelayState token; both
    # go to Stads' own assertion consumer to complete the sign-on.
    wayf_soup = BeautifulSoup(wayf.text, 'html5lib')
    saml_response = wayf_soup.find('input', {'name': 'SAMLResponse'})['value']
    relay_state = wayf_soup.find('input', {'name': 'RelayState'})['value']
    assertion = session.post(
        'https://sbstads.au.dk/sb_STAP/saml/SAMLAssertionConsumer',
        data={'SAMLResponse': saml_response, 'RelayState': relay_state})

    # 200 means the session is now authenticated.
    print(assertion.status_code)

    # With the authenticated session the results page is finally reachable.
    results = session.get('https://sbstads.au.dk/sb_STAP/sb/resultater/studresultater.jsp')

    # One last sanity check that we really have access.
    print(results.status_code)

    return BeautifulSoup(results.text, 'html5lib')
|
||
|
|
||
|
|
||
|
|
||
|
def createGradeFile():
    """Scrape the current grade table and write the course names to
    temp_new_grades.log, one course per line, replacing the old contents.
    """
    soup = findGrades()

    # Each result row on the Stads page is a <tr class="DataSelect">;
    # the first <td> cell of a row holds the course name.
    newest_grades = soup.find_all('tr', {'class': 'DataSelect'})

    # Mode 'w' both creates the file when missing and truncates an existing
    # one — this is what the original exists-check plus the truncate() call
    # inside the loop was trying to do.  The 'with' block also guarantees
    # the file is closed even if a row fails to parse.
    with open('temp_new_grades.log', 'w') as grade_file:
        for grade in newest_grades:
            # Keep the exact line format ("<name> \n") so diffs against
            # previously written old_grades.log files stay meaningful.
            grade_file.write('%s \n' % str.strip(grade.find_all('td')[0].getText()))
|
||
|
|
||
|
def diffGradeLists():
    """Refresh temp_new_grades.log and return the courses that appear in it
    but not in old_grades.log (i.e. the newly graded courses).

    Returns a list of course-name lines (each still carrying its newline).
    """
    createGradeFile()

    # The files were previously opened 'r+' even though they are only read;
    # plain 'r' inside a 'with' block is both correct and leak-free.
    with open('./temp_new_grades.log', 'r') as new_grade_file:
        new_lines = new_grade_file.readlines()

    # A missing old_grades.log (e.g. on the very first run) used to crash
    # the script; treat it as "no previously seen courses" instead.
    if os.path.isfile('./old_grades.log'):
        with open('./old_grades.log', 'r') as old_grade_file:
            old_lines = old_grade_file.readlines()
    else:
        old_lines = []

    diff = difflib.unified_diff(old_lines, new_lines,
                                fromfile='file1', tofile='file2',
                                lineterm="\n", n=0)

    # Drop the two '---'/'+++' header lines of the unified diff; what is
    # left are '@@' hunk markers and '+'/'-' content lines.
    lines = list(diff)[2:]

    added = [line[1:] for line in lines if line.startswith('+')]
    removed = [line[1:] for line in lines if line.startswith('-')]

    # A course counts as new only if it was added without also being
    # removed (a removed+added pair is just a changed/moved line).
    return [line for line in added if line not in removed]
|
||
|
|
||
|
def checker():
    """Check for new grades and, when there are any, persist the current
    snapshot and send notifications by e-mail and Twitter.
    """
    new_grades = diffGradeLists()

    if not new_grades:
        # Nothing changed; no files need touching and nothing is sent.
        print('There are no new grades')
        return

    # Replace the old snapshot with the current one.  Opening the old file
    # in 'w' mode truncates it first: the original 'r+' copy overwrote from
    # position 0 without truncating, so stale trailing lines survived
    # whenever the old file was longer than the new one.
    with open('./temp_new_grades.log', 'r') as new_grade_file:
        with open('./old_grades.log', 'w') as old_grade_file:
            for line in new_grade_file:
                old_grade_file.write(line)

    # Each entry in new_grades already ends with a newline, so a plain
    # join reproduces the original "+=" accumulation exactly.
    grades_string = "New grade(s) in the following course(s):\n" + ''.join(new_grades)

    print(grades_string)
    mail_handler.handle(grades_string)
    tweeter(grades_string)
|
||
|
|
||
|
def tweeter(grades):
    """Post the given grades summary as a Twitter status update.

    grades: the already-formatted message string to tweet.
    """
    # The original "'{:s}'.format(grades)" was a no-op copy of the string;
    # pass it straight through to the API.
    api.update_status(status=grades)
|
||
|
|
||
|
|
||
|
# Entry point: run a single grade check when the script is executed.
# NOTE(review): this also fires on import; consider an `if __name__ == "__main__":`
# guard if the module is ever imported elsewhere.
checker()
|
||
|
|
||
|
|