Hello!
Here’s my solution to the Hacker News Dataset Analysis.
Suggestions? Recommendations?
Thanks!
Lucio
My Code: <!--
#!/usr/bin/env python
# coding: utf-8
# Hacker News Dataset Analysis
## By: Lucio Gayosso (05/02/20)
# In[29]:
from csv import reader


def split_posts(rows):
    """Split post rows into Ask HN, Show HN and all remaining posts.

    The title is taken from column 1 of each row and matched
    case-insensitively against the prefixes ``ask hn`` and ``show hn``.

    Args:
        rows: Iterable of post rows (sequences of strings). The CSV header
            row must already have been removed.

    Returns:
        A tuple ``(ask_posts, show_posts, other_posts)``; each element is a
        list of the rows that fell into that category, in input order.
    """
    ask_posts = []
    show_posts = []
    other_posts = []
    for row in rows:
        title = row[1].lower()
        if title.startswith('ask hn'):
            ask_posts.append(row)
        elif title.startswith('show hn'):
            show_posts.append(row)
        else:
            other_posts.append(row)
    return ask_posts, show_posts, other_posts


def average_comments(posts):
    """Return the mean comment count (column 4) over ``posts``.

    Args:
        posts: List of post rows whose column 4 holds the number of
            comments as a string of digits.

    Returns:
        The average as a float, or ``0.0`` when ``posts`` is empty so that
        an empty category does not raise ``ZeroDivisionError``.

    Raises:
        ValueError: If a comment count cannot be converted with ``int``.
    """
    if not posts:
        return 0.0
    return sum(int(row[4]) for row in posts) / len(posts)


if __name__ == "__main__":
    # The csv module documentation recommends newline='' for files passed
    # to csv.reader; the with-block guarantees the file is closed.
    with open('hacker_news.csv', newline='') as opened_file:
        hn = list(reader(opened_file))

    # Keep the header row out of the data so it is not counted as a post.
    headers = hn[0]
    hn = hn[1:]

    ask_posts, show_posts, other_posts = split_posts(hn)

    # Calculate average number of Ask comments
    avg_ask_comments = average_comments(ask_posts)
    print("Average Ask comments: ", avg_ask_comments)

    # Calculate average number of Show comments
    avg_show_comments = average_comments(show_posts)
    print("Average Show comments: ", avg_show_comments)
## Analysis
# Ask HN posts receive more comments on average than Show HN posts.
# In[142]:
import datetime as dt


def tally_comments_by_hour(posts):
    """Count posts and sum comments per hour of creation.

    The creation timestamp is read from column 6 of each row and parsed
    with the format ``%m/%d/%Y %H:%M``; the comment count is read from
    column 4.

    Args:
        posts: Iterable of post rows (sequences of strings).

    Returns:
        A tuple ``(counts_by_hour, comments_by_hour)`` of dicts keyed by the
        two-digit hour string (``"00"``-``"23"``): the number of posts
        created in that hour and the total comments those posts received.

    Raises:
        ValueError: If a timestamp does not match the expected format or a
            comment count cannot be converted with ``int``.
    """
    counts_by_hour = {}
    comments_by_hour = {}
    for row in posts:
        num_comments = int(row[4])
        created_at = dt.datetime.strptime(row[6], "%m/%d/%Y %H:%M")
        hour = created_at.strftime("%H")
        counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
        comments_by_hour[hour] = comments_by_hour.get(hour, 0) + num_comments
    return counts_by_hour, comments_by_hour


def average_by_hour(counts_by_hour, comments_by_hour):
    """Return ``[hour, average comments per post]`` pairs for every hour.

    Args:
        counts_by_hour: Dict mapping hour string to number of posts.
        comments_by_hour: Dict mapping hour string to total comments.

    Returns:
        A list of two-element lists ``[hour, average]`` in the iteration
        order of ``counts_by_hour``.
    """
    # Avg comments = number of comments / count of posts
    return [
        [hour, comments_by_hour[hour] / counts_by_hour[hour]]
        for hour in counts_by_hour
    ]


if __name__ == "__main__":
    # ask_posts is produced by the previous cell of this script.
    counts_by_hour, comments_by_hour = tally_comments_by_hour(ask_posts)
    avg_by_hour = average_by_hour(counts_by_hour, comments_by_hour)

    # Arrange the results: put the average first so sorting orders by it.
    swap_avg_by_hour = [[avg, hour] for hour, avg in avg_by_hour]
    print(swap_avg_by_hour)

    sorted_swap = sorted(swap_avg_by_hour, reverse=True)

    print("Top 5 Hours for Ask Posts Comments:")
    for avg, hour in sorted_swap[:5]:
        # hour is already a zero-padded "%H" string, so no re-parsing needed.
        print("{}:00: {:.2f} average comments per post".format(hour, avg))
# Analysis
# The best time to post is 3 pm (15:00), when Ask HN posts receive an average of 38.59 comments per post.
-->
What actually happened:
Replace this line with the output/error