For each headline listed on fivethirtyeight.com/politics/features (the main article at the top, then the posts under "Latest Politics"), we store the type of post, its title, its URL, the author(s), the date and time it was posted, the list of tags 538 assigns to it, and the number of comments. We scrape every headline on the features pages except the live blogs, which don't have any comments. The hardest part to scrape is the number of comments, since 538 loads them through the Facebook comments plugin. First we import the necessary Python modules.
# Get the html
import requests
# Parse the html
from bs4 import BeautifulSoup
# Render JavaScript to scrape the comments
from selenium import webdriver
from selenium.webdriver.common.by import By
# Delays between page loads and timing the execution of the code
import time
# Get the date and time
from datetime import datetime
# For splitting using more than one delimiter
import re
# Makes a csv file quickly
import pandas as pd
Since scraping the comments is the hardest part, we write a function dedicated to it. It only works for posts from fivethirtyeight.com/features. The function takes some time each time it's run, so it prints messages to track its progress; any line in the code with the comment "for debugging" can be commented out.
# Input is the url of one of the features posts on fivethirtyeight.com/politics/features pages.
# Output is the number of comments on the post.
def num_comments_538_post(url):
    # Start the timer to time the execution of each iteration of this function
    start = time.time() # for debugging
    # Function only works when the input is a features article from fivethirtyeight.com
    print("Comments scraping current url:", url) # for debugging
    # Create a webdriver object with selenium that will get the required html
    # Here Chrome is used, but the code can be modified for other browsers
    driver = webdriver.Chrome()
    # Open the 538 webpage
    driver.get(url)
    # Wait 10 seconds so the page and the comments widget have time to load
    time.sleep(10)
    # Click the expand-comments button
    driver.find_element(By.CLASS_NAME, "fte-expandable-icon").click()
    # Grab the html after the JavaScript has run
    article_html = driver.execute_script("return document.documentElement.outerHTML;")
    # Close the 538 webpage
    driver.quit()
    # Parse the html
    article_soup = BeautifulSoup(article_html, "lxml")
    # Find the iframe corresponding to the comments
    comments_frame = article_soup.find('iframe', attrs = {'data-testid': "fb:comments Facebook Social Plugin"})
    # Get the source attribute of the iframe
    comments_url = comments_frame['src']
    # Redefine the webdriver object (needed to avoid errors)
    driver = webdriver.Chrome()
    # Open the Facebook comments plugin url
    driver.get(comments_url)
    # Grab the rendered html of that page
    comments_html = driver.execute_script("return document.documentElement.outerHTML;")
    # Close the comments page
    driver.quit()
    # Parse the rendered code
    comments_soup = BeautifulSoup(comments_html, "lxml")
    # Find the element that contains the number of comments
    number = comments_soup.find('span', attrs = {'class': "_50f7"}).text.strip(" comments")
    print("The number of comments is "+str(number)+".") # for debugging
    # End the timer
    end = time.time() # for debugging
    print("Time elapsed:", end-start, "seconds\n") # for debugging
    return number
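Before running the full scrape, the function can be tried on a single post. The URL in the snippet below is only a placeholder, not an actual 538 post; replace it with any non-live-blog post from the features pages before uncommenting.
# Optional sanity check on a single post (placeholder url -- replace with a real
# non-live-blog post from fivethirtyeight.com/politics/features before uncommenting)
#print(num_comments_538_post("https://fivethirtyeight.com/features/some-post/")) # for debugging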
Now we extract the desired data from each headline under "Latest Politics", including the main article, on the 538 features page(s). In the following code the authors and tags are first collected as lists. However, when we convert all the data into a data frame later, it needs to have the right shape: a list of lists with no further nesting. So for the authors and tags we turn each list into a single string whose items are separated by semicolons instead of commas, which makes it possible to write the data to a .csv file.
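As a small illustration, the conversion used in the code below behaves like this (the author names here are made up, for illustration only):
# Example of the list-to-string conversion (made-up names, for illustration only)
example_authors = ["Jane Doe", "John Roe"]
print(str(example_authors).replace(",", ";").strip("[" "]").replace("'", ""))
# prints: Jane Doe; John Roe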
# Get the date and time to put in the name of the output file
now = datetime.now()
# Set timer for the full execution
start_full = time.time() # for debugging
# How many pages of features to extract data from
features_num_pages = 110 #int(input("How many features pages to scrape? Each has about 10 posts. "))
#print("This code will scrape data from", features_num_pages, "page(s) worth of posts in 538's politics/features section.\n") # for debugging
# Here is where all the data will go
posts = []
# Get the data for each post
for i in range(features_num_pages):
    print("\nPage "+str(i+1)+"...\n") # for debugging
    # Get the html for the current features page
    features_url = "https://fivethirtyeight.com/politics/features/page/"+str(i+1)
    features_html = requests.get(features_url)
    # Parse the html
    features_soup = BeautifulSoup(features_html.content, "lxml")
    # Gather the data for each of the articles
    features = features_soup.find_all('h2', attrs = {'class': ["article-title entry-title", "title entry-title"]})
    for post in features:
        # Get the post title from the features page
        title = post.a.text.strip('\n''\t')
        # Get the post url from the features page
        url = post.find('a').get('href')
        # Skip live blogs, which don't have comments
        if "live-blog" in url:
            continue
        # Go to the url to get more data
        post_code = requests.get(url)
        post_soup = BeautifulSoup(post_code.content, "lxml")
        # Get author(s)
        author_bios = post_soup.find_all('div', attrs = {'class': "mini-bio"})
        if author_bios == []:
            authors = "None/All"
        else:
            authors_list = []
            for author in author_bios:
                # Extract the author name from the start of the mini bio
                to_extract = author.p.text
                to_extract_list = re.split(" is | reports", to_extract)
                authors_list.append(to_extract_list[0])
            authors = str(authors_list).replace(",", ";").strip("[" "]").replace("\'", "")
        # Get the date and time of the post
        date = post_soup.find('time').text.strip('\n''\t')
        # Get the tags
        tags_list = []
        for tag in post_soup.find_all('a', attrs = {'class': "tag"}):
            tags_list.append(tag.text.split(" (")[0])
        tags = str(tags_list).replace(",", ";").strip("[" "]").replace("\'", "")
        # Use the tags to get the post type
        if "Politics Podcast" in tags:
            post_type = "podcast"
        else:
            post_type = post.find('a').get('data-content-type')
            if post_type is None:
                post_type = "feature"
        # Change the name "feature" to "article"
        if post_type == "feature":
            post_type = "article"
        # Get the number of comments
        num_comments = num_comments_538_post(url)
        # Add all attributes to the list
        posts.append([post_type, title, url, authors, date, tags, num_comments])
        # Stop once 1000 posts have been collected
        if len(posts) >= 1000:
            break
    if len(posts) >= 1000:
        break
# End the timer for the full execution
end_full = time.time() # for debugging
# Compute the time elapsed in seconds
total_time_seconds = end_full-start_full # for debugging
# In minutes
total_time_minutes = total_time_seconds/60 # for debugging
if total_time_minutes < 60: # for debugging
    print("Total time elapsed =", total_time_minutes, "minutes") # for debugging
else: # for debugging
    # In hours
    total_time_hours = total_time_minutes/60 # for debugging
    # Print the time elapsed in hours
    print("Total time elapsed =", total_time_hours, "hours") # for debugging
# The data
print("Number of posts scraped:", len(posts)) # for debugging
#posts # for debugging
Now we turn the scraped data into a data frame and save it to a .csv file to use in the data exploration phase.
# Use pandas to make a data frame
df = pd.DataFrame(posts)
df.columns = ["Post type", "Title", "Post url", "Author(s)", "Date and time posted", "Tags", "No. of comments"]
# Then save it as a .csv file, with the index column removed
df.to_csv("ProblemStatementOutputs/"+str(len(posts))+"_"+now.strftime("%d-%m-%Y_%H-%M-%S")+".csv", index = False)
The name of the file has the form (number of posts)_(day)-(month)-(year in 4 digits)_(hour in 24-hour time)-(minute)-(second).csv.
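In the data exploration phase the file can then be read back in with pandas. The filename below only illustrates the naming pattern; it is not an actual output file, so use the name produced by the run above.
# Read the saved data back in for the exploration phase
# (this filename only illustrates the pattern -- use the name produced by the run above)
#df = pd.read_csv("ProblemStatementOutputs/1000_31-12-2023_18-45-07.csv")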
Which 538 features posts get the most traffic?
News has become more polarized and sensationalized in recent years, all in the name of more clicks. This data analysis could provide some insight into what kinds of articles and other content (podcasts and videos) draw the most traffic, without news organizations having to compromise their neutrality or factual accuracy.