For each headline listed on fivethirtyeight.com/politics/features (the main article at the top, then the posts under "Latest Politics"), we store the type of post, its title, its URL, the author(s), the date and time it was posted, the list of tags 538 assigns to it, and the number of comments. We scrape every headline on the features pages except the live blogs, which don't have any comments. The hardest part to scrape is the number of comments, since 538 loads them through the Facebook comments plugin. First we import the necessary Python modules.
# Get the html
import requests
# Parse the html
from bs4 import BeautifulSoup
# Render JavaScript to scrape the comments
from selenium import webdriver
from selenium.webdriver.common.by import By
# Delays between page loads and timing the execution of the code
import time
# Get the date and time
from datetime import datetime
# For splitting using more than one delimiter
import re
# Makes a csv file quickly
import pandas as pd
Since scraping the comments is the hardest part, we write a function dedicated to it. It only works for posts from fivethirtyeight.com/features. The function takes some time each time it's run, so it prints messages to track its progress; any line in the code with the comment "for debugging" can be commented out.
# Input is the url of one of the features posts on fivethirtyeight.com/politics/features pages.
# Output is the number of comments on the post.
def num_comments_538_post(url):
    # Start the timer to time the execution of each iteration of this function
    start = time.time() # for debugging
    # Function only works when the input is a features article from fivethirtyeight.com
    print("Comments scraping current url:", url) # for debugging
    # Create a webdriver object with selenium that will get the required html
    # Here Chrome is used, but the code can be modified for other browsers
    driver = webdriver.Chrome()
    # Open the 538 webpage
    driver.get(url)
    # Wait 10 seconds so the page and the comments widget have time to load
    time.sleep(10)
    # Click the expand-comments button
    driver.find_element(By.CLASS_NAME, "fte-expandable-icon").click()
    # Grab the html after the JavaScript has run
    article_html = driver.execute_script("return document.documentElement.outerHTML;")
    # Close the 538 webpage
    driver.quit()
    # Parse the html
    article_soup = BeautifulSoup(article_html, "lxml")
    # Find the iframe corresponding to the comments
    comments_frame = article_soup.find('iframe', attrs = {'data-testid': "fb:comments Facebook Social Plugin"})
    # Get the source attribute of the iframe
    comments_url = comments_frame['src']
    # Redefine the webdriver object (needed to avoid errors)
    driver = webdriver.Chrome()
    # Open the Facebook comments plugin url
    driver.get(comments_url)
    # Grab the rendered html of that page
    comments_html = driver.execute_script("return document.documentElement.outerHTML;")
    # Close the comments page
    driver.quit()
    # Parse the rendered code
    comments_soup = BeautifulSoup(comments_html, "lxml")
    # Find the element that contains the number of comments
    number = comments_soup.find('span', attrs = {'class': "_50f7"}).text.strip(" comments")
    print("The number of comments is "+str(number)+".") # for debugging
    # End the timer
    end = time.time() # for debugging
    print("Time elapsed:", end-start, "seconds\n") # for debugging
    return number
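Before running the full scrape, the function can be tried on a single post. The URL in the snippet below is only a placeholder, not an actual 538 post; replace it with any non-live-blog post from the features pages before uncommenting.
# Optional sanity check on a single post (placeholder url -- replace with a real
# non-live-blog post from fivethirtyeight.com/politics/features before uncommenting)
#print(num_comments_538_post("https://fivethirtyeight.com/features/some-post/")) # for debugging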
Now we extract the desired data from each headline under "Latest Politics", including the main article, on the 538 features page(s). In the following code the authors and tags are first collected as lists. However, when we convert all the data into a data frame later, it needs to have the right shape: a list of lists with no further nesting. So for the authors and tags we turn each list into a single string whose items are separated by semicolons instead of commas, which makes it possible to write the data to a .csv file.
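As a small illustration, the conversion used in the code below behaves like this (the author names here are made up, for illustration only):
# Example of the list-to-string conversion (made-up names, for illustration only)
example_authors = ["Jane Doe", "John Roe"]
print(str(example_authors).replace(",", ";").strip("[" "]").replace("'", ""))
# prints: Jane Doe; John Roe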
# Get the date and time to put in the name of the output file
now = datetime.now()
# Set timer for the full execution
start_full = time.time() # for debugging
# How many pages of features to extract data from
features_num_pages = 110 #int(input("How many features pages to scrape? Each has about 10 posts. "))
#print("This code will scrape data from", features_num_pages, "page(s) worth of posts in 538's politics/features section.\n") # for debugging
# Here is where all the data will go
posts = []
# Get the data for each post
for i in range(features_num_pages):
    print("\nPage "+str(i+1)+"...\n") # for debugging
    # Get the html for the current features page
    features_url = "https://fivethirtyeight.com/politics/features/page/"+str(i+1)
    features_html = requests.get(features_url)
    # Parse the html
    features_soup = BeautifulSoup(features_html.content, "lxml")
    # Gather the data for each of the articles
    features = features_soup.find_all('h2', attrs = {'class': ["article-title entry-title", "title entry-title"]})
    for post in features:
        # Get the post title from the features page
        title = post.a.text.strip('\n''\t')
        # Get the post url from the features page
        url = post.find('a').get('href')
        # Skip live blogs, which don't have comments
        if "live-blog" in url:
            continue
        # Go to the url to get more data
        post_code = requests.get(url)
        post_soup = BeautifulSoup(post_code.content, "lxml")
        # Get author(s)
        author_bios = post_soup.find_all('div', attrs = {'class': "mini-bio"})
        if author_bios == []:
            authors = "None/All"
        else:
            authors_list = []
            for author in author_bios:
                # Extract the author name from the start of the mini bio
                to_extract = author.p.text
                to_extract_list = re.split(" is | reports", to_extract)
                authors_list.append(to_extract_list[0])
            authors = str(authors_list).replace(",", ";").strip("[" "]").replace("\'", "")
        # Get the date and time of the post
        date = post_soup.find('time').text.strip('\n''\t')
        # Get the tags
        tags_list = []
        for tag in post_soup.find_all('a', attrs = {'class': "tag"}):
            tags_list.append(tag.text.split(" (")[0])
        tags = str(tags_list).replace(",", ";").strip("[" "]").replace("\'", "")
        # Use the tags to get the post type
        if "Politics Podcast" in tags:
            post_type = "podcast"
        else:
            post_type = post.find('a').get('data-content-type')
            if post_type is None:
                post_type = "feature"
        # Change the name "feature" to "article"
        if post_type == "feature":
            post_type = "article"
        # Get the number of comments
        num_comments = num_comments_538_post(url)
        # Add all attributes to the list
        posts.append([post_type, title, url, authors, date, tags, num_comments])
        # Stop once 1000 posts have been collected
        if len(posts) >= 1000:
            break
    if len(posts) >= 1000:
        break
# End the timer for the full execution
end_full = time.time() # for debugging
# Compute the time elapsed in seconds
total_time_seconds = end_full-start_full # for debugging
# In minutes
total_time_minutes = total_time_seconds/60 # for debugging
if total_time_minutes < 60: # for debugging
    print("Total time elapsed =", total_time_minutes, "minutes") # for debugging
else: # for debugging
    # In hours
    total_time_hours = total_time_minutes/60 # for debugging
    # Print the time elapsed in hours
    print("Total time elapsed =", total_time_hours, "hours") # for debugging
# The data
print("Number of posts scraped:", len(posts)) # for debugging
#posts # for debugging
Now we turn the scraped data into a data frame and save it to a .csv file to use in the data exploration phase.
# Use pandas to make a data frame
df = pd.DataFrame(posts)
df.columns = ["Post type", "Title", "Post url", "Author(s)", "Date and time posted", "Tags", "No. of comments"]
# Then save it as a .csv file, with the index column removed
df.to_csv("ProblemStatementOutputs/"+str(len(posts))+"_"+now.strftime("%d-%m-%Y_%H-%M-%S")+".csv", index = False)
The name of the file has the form (number of posts)_(day)-(month)-(year in 4 digits)_(hour in 24-hour time)-(minute)-(second).csv.
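In the data exploration phase the file can then be read back in with pandas. The filename below only illustrates the naming pattern; it is not an actual output file, so use the name produced by the run above.
# Read the saved data back in for the exploration phase
# (this filename only illustrates the pattern -- use the name produced by the run above)
#df = pd.read_csv("ProblemStatementOutputs/1000_31-12-2023_18-45-07.csv")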
Which 538 features posts get the most traffic?
News has become more polarized and sensationalized in recent years, all in the name of more clicks. This data analysis could provide some insight into what kinds of articles and other content (podcasts and videos) draw the most traffic, without news organizations having to compromise their neutrality or factual accuracy.