Stads_Scraping/RequestCrawler.py

170 lines
5.2 KiB
Python
Raw Normal View History

2017-04-16 19:06:02 +00:00
#!/usr/bin/python3
import sys
from bs4 import BeautifulSoup
import requests
from requests.auth import HTTPBasicAuth
import io
import os
import time
from twitter import *
import tweepy
import difflib
# This imports from a file called 'constants'. This is at the moment, the file called 'constants_template'
import constants
import mail_handler
# Shared requests session, reused by every HTTP call in this file so that
# cookies handed out during the login flow are sent on later requests.
session = requests.Session()

# Twitter credentials, read from the local 'constants' module and keyed the
# way get_api() expects them.
cfg = dict(
    consumer_key=constants.cons_key,
    consumer_secret=constants.cons_secret,
    access_token=constants.access_token,
    access_token_secret=constants.access_token_secret,
)
# Twitter connection setup
def get_api(cfg):
    """Return a tweepy API client authenticated with the keys in *cfg*.

    *cfg* is a mapping that provides 'consumer_key', 'consumer_secret',
    'access_token' and 'access_token_secret'.
    """
    handler = tweepy.OAuthHandler(cfg['consumer_key'], cfg['consumer_secret'])
    handler.set_access_token(cfg['access_token'], cfg['access_token_secret'])
    client = tweepy.API(handler)
    return client
# Module-level Twitter client, built once when the module is loaded; tweeter()
# posts through it.
api = get_api(cfg)
# The main method for scraping Stads
def _meta_refresh_target(soup):
    """Return the redirect URL announced by the page's meta refresh tag.

    Prefers a <meta http-equiv="refresh"> tag; if none is present the first
    <meta> tag carrying a 'content' attribute is used instead.

    Raises:
        ValueError: if the page has no usable <meta> tag.
    """
    meta = None
    for tag in soup.find_all('meta'):
        if (tag.get('http-equiv') or '').lower() == 'refresh' and tag.get('content'):
            meta = tag
            break
    if meta is None:
        # No explicit refresh tag: fall back to the first <meta> on the page.
        meta = soup.find('meta')
    if meta is None or not meta.get('content'):
        raise ValueError('Stads entry page: no meta redirect tag found')

    content = meta['content']
    # Expected shape is "<delay>;url=<target>".
    _delay, separator, target = content.partition(';')
    target = target.strip()
    if separator and target[:4].lower() == 'url=':
        return target[4:].strip().strip('\'"')
    # Unrecognised layout: assume a fixed six-character prefix before the URL.
    return content[6:]


def _hidden_input_value(soup, name, step):
    """Return the 'value' of the <input name=*name*> element in *soup*.

    *step* names the login stage and is only used in the error message.

    Raises:
        ValueError: if the input element or its value attribute is missing.
    """
    field = soup.find('input', {'name': name})
    if field is None or not field.has_attr('value'):
        raise ValueError(
            "{}: form field '{}' is missing from the response "
            "(login rejected or page layout changed?)".format(step, name)
        )
    return field['value']


def _report_status(response):
    """Print the HTTP status of *response* and raise on a 4xx/5xx answer."""
    print(response.status_code)
    response.raise_for_status()
    return response


def findGrades(timeout=30):
    """Log in to Stads through WAYF and return the parsed results page.

    The flow is: request the results page, follow its meta redirect to the
    login form, post the credentials from 'constants', relay the two
    SAMLResponse forms (WAYF, then Stads), and finally request the results
    page again with the now-authenticated session.

    Args:
        timeout: seconds to wait on each HTTP request before giving up.
            Without a timeout a stalled server would block the script
            indefinitely.

    Returns:
        A BeautifulSoup document (html5lib parser) of the results page.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts,
            or when a step answers with an HTTP error status.
        ValueError: when an expected redirect tag or form field is missing.
    """
    results_url = 'https://sbstads.au.dk/sb_STAP/sb/resultater/studresultater.jsp'

    # First request to Stads: unauthenticated, it answers with a meta redirect.
    stads = session.get(results_url, timeout=timeout)
    stads.raise_for_status()
    redirect_url = _meta_refresh_target(BeautifulSoup(stads.text, 'html5lib'))

    # Follow the redirect; the final URL after redirects is the login endpoint.
    meta_tag_redirect = _report_status(session.get(redirect_url, timeout=timeout))
    meta_tag_url = meta_tag_redirect.url

    # Log in to WAYF with the configured credentials.
    wayf_login = _report_status(session.post(
        meta_tag_url,
        data={'username': constants.USERNAME, 'password': constants.PASSWORD},
        timeout=timeout,
    ))
    soup = BeautifulSoup(wayf_login.text, 'html5lib')
    SAMLResponse = _hidden_input_value(soup, 'SAMLResponse', 'WAYF login')

    # Hand the first SAMLResponse to WAYF's assertion consumer.
    wayf = _report_status(session.post(
        'https://wayf.wayf.dk/module.php/saml/sp/saml2-acs.php/wayf.wayf.dk',
        data={'SAMLResponse': SAMLResponse},
        timeout=timeout,
    ))
    soup = BeautifulSoup(wayf.text, 'html5lib')
    # WAYF answers with a second SAMLResponse plus a RelayState for Stads.
    SAMLResponse = _hidden_input_value(soup, 'SAMLResponse', 'WAYF assertion')
    RelayState = _hidden_input_value(soup, 'RelayState', 'WAYF assertion')

    # Last post: deliver the assertion to Stads itself.
    _report_status(session.post(
        'https://sbstads.au.dk/sb_STAP/saml/SAMLAssertionConsumer',
        data={'SAMLResponse': SAMLResponse, 'RelayState': RelayState},
        timeout=timeout,
    ))

    # The session is now authenticated, so the results page can be fetched.
    resultater = _report_status(session.get(results_url, timeout=timeout))
    return BeautifulSoup(resultater.text, 'html5lib')
def createGradeFile():
    """Scrape the results page and rewrite temp_new_grades.log from it.

    For every <tr class="DataSelect"> row returned by findGrades(), the
    stripped text of the row's first <td> is written on its own line,
    followed by a space and a newline.

    The file is opened in 'w' mode so it is created when missing and always
    emptied first; a scrape that yields no rows therefore leaves an empty
    file instead of the previous run's content.
    """
    soup = findGrades()
    rows = soup.find_all('tr', {'class': 'DataSelect'})
    with open('temp_new_grades.log', 'w') as grade_file:
        for row in rows:
            cells = row.find_all('td')
            if not cells:
                # A row without cells carries no text to record.
                continue
            grade_file.write('%s \n' % cells[0].getText().strip())
def diffGradeLists():
    """Refresh the grade snapshot and return the lines that are new.

    Calls createGradeFile(), then diffs old_grades.log against
    temp_new_grades.log with difflib.unified_diff (no context lines).

    Returns:
        A list of lines (including their trailing newline) that the diff
        marks as added and that are not also marked as removed.

    A missing old_grades.log is treated as an empty file, so on the very
    first run every scraped line is reported as new.
    """
    createGradeFile()

    try:
        with open('./old_grades.log', 'r') as old_grade_file:
            old_lines = old_grade_file.readlines()
    except FileNotFoundError:
        # First run: there is no previous snapshot to compare against.
        old_lines = []
    with open('./temp_new_grades.log', 'r') as new_grade_file:
        new_lines = new_grade_file.readlines()

    diff = difflib.unified_diff(old_lines, new_lines, fromfile='file1',
                                tofile='file2', lineterm="\n", n=0)
    # Skip the two '---' / '+++' header lines; '@@' hunk markers are ignored
    # below because they start with neither '+' nor '-'.
    changes = list(diff)[2:]
    added = [line[1:] for line in changes if line.startswith('+')]
    removed = {line[1:] for line in changes if line.startswith('-')}
    # A line that shows up as both added and removed has only moved.
    return [line for line in added if line not in removed]
def checker():
    """Check for new grades and, if any, record and announce them.

    When diffGradeLists() reports nothing new, a notice is printed and
    nothing else happens. Otherwise old_grades.log is replaced by the
    content of temp_new_grades.log, and a message listing the new lines is
    printed, passed to mail_handler.handle() and tweeted via tweeter().

    old_grades.log is rewritten in 'w' mode so that it ends up an exact copy
    of the new snapshot, even when the new snapshot is shorter than the old
    one.
    """
    new_grades = diffGradeLists()
    if not new_grades:
        print('There are no new grades')
        return

    # Promote the fresh snapshot to be the baseline for the next run.
    with open('./temp_new_grades.log', 'r') as new_grade_file:
        snapshot = new_grade_file.read()
    with open('./old_grades.log', 'w') as old_grade_file:
        old_grade_file.write(snapshot)

    # Each entry in new_grades already ends with a newline.
    grades_string = "New grade(s) in the following course(s):\n" + ''.join(new_grades)
    print(grades_string)
    mail_handler.handle(grades_string)
    tweeter(grades_string)
def tweeter(grades):
    """Post the string *grades* as a status update via the module-level client."""
    status_text = format(grades, 's')
    api.update_status(status=status_text)
# Run one check immediately. There is no `if __name__ == "__main__"` guard,
# so this also executes whenever the module is imported.
checker()