Hacker News data set: Ask HN or Show HN

In this project, we’ll work with a data set of submissions to the popular technology site Hacker News.

We’ll compare two types of posts, Ask HN and Show HN, to determine the following:

Do Ask HN or Show HN posts receive more comments on average? Do posts created at a certain time of day receive more comments on average?

from csv import reader
# Load every row of hacker_news.csv into memory as a list of lists.
# The `with` block closes the file handle once the rows have been read.
# newline="" is what the csv module documentation asks for when opening a
# file for a reader, so newlines embedded in quoted fields parse correctly.
# NOTE(review): encoding="utf-8" assumes the export is UTF-8 -- confirm.
with open("hacker_news.csv", newline="", encoding="utf-8") as open_file:
    read_file = reader(open_file)
    hn = list(read_file)

# The first row is kept separately as the header; the rest is the data.
hn_header = hn[0]
hn = hn[1:]
# print(hn_header)
# print(hn[:5])

# Split the submissions into three buckets based on how the title starts.
ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    title = row[1]
    # Lower-case the title so the prefix match is case-insensitive.
    new = title.lower()
    if new.startswith("ask hn"):
        ask_posts.append(row)
    elif new.startswith("show hn"):
        show_posts.append(row)
    else:
        other_posts.append(row)

# print(len(ask_posts))

# Total and mean number of comments over all Ask HN posts.
# Index 4 of each row is parsed as that post's comment count.
total_ask_comments = sum(int(comm[4]) for comm in ask_posts)
# With no Ask HN posts there is nothing to average; report 0.0 rather than
# raising ZeroDivisionError.
avg_ask_comments = total_ask_comments / len(ask_posts) if ask_posts else 0.0
# print(total_ask_comments, avg_ask_comments)

# Total and mean number of comments over all Show HN posts.
# Index 4 of each row is parsed as that post's comment count.
total_show_comments = sum(int(comm[4]) for comm in show_posts)
# With no Show HN posts there is nothing to average; report 0.0 rather than
# raising ZeroDivisionError.
avg_show_comments = total_show_comments / len(show_posts) if show_posts else 0.0
# print(total_show_comments, avg_show_comments)

import datetime as dt

# For Ask HN posts, tally per creation hour:
#   count_by_hour[hour]    -> number of posts created in that hour
#   comments_by_hour[hour] -> total comments on those posts
# Keys are the two-digit hour strings produced by the "%H" format.
count_by_hour = {}
comments_by_hour = {}

for row in ask_posts:
    created_at = row[6]
    num_com = int(row[4])
    # Parse the timestamp, then keep only its hour component.
    hour = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").strftime("%H")
    # Each post must contribute exactly once. Initialising to 1 and then
    # unconditionally adding again would count the first post of every hour
    # twice, so start from 0 via .get() and add a single time.
    count_by_hour[hour] = count_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + num_com

print(comments_by_hour)

# Average comments per post for each hour, stored as [hour, average] pairs.
avg_by_hour = []
for a in comments_by_hour:
    avg = comments_by_hour[a] / count_by_hour[a]
    # Record the result; without this append the list would stay empty.
    avg_by_hour.append([a, avg])
# print(avg_by_hour)

# Flip each avg_by_hour entry so its second element comes first; sorting the
# flipped pairs then orders them by that element.
swap = [[a[1], a[0]] for a in avg_by_hour]

# Largest values first.
sorted_swap = sorted(swap, reverse=True)

# Print the top five entries; row[1] fills the "{}:00" slot and row[0] is
# formatted to two decimals.
txt = "{}:00: {:.2f} average comments per post"
for row in sorted_swap[:5]:
    print(txt.format(row[1], row[0]))

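Conclusion: Ask HN posts receive more comments on average than Show HN posts; also, Ask HN posts created between 15:00 and 16:00 receive the most comments on average, which suggests people are most likely to comment on or answer questions posted around that time.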