# Going fast! #DataquestChallenge Premium Annual Offer:
# 500 get 50% & the next 1000 get 40% off.

# Guided Project 2: Book Profitability - R Solutions (EW)

library(tidyverse) #load packages
# Import the book review data and take a first look at its size, column names
# and storage type.
# NOTE(review): the path below is absolute and machine-specific -- point it at
# wherever book_reviews.csv lives before running.
Book_Reviews <- read_csv("/Users/Documents/R_Files/DataQuest_Files/Guided_Projects/Project_Data/book_reviews.csv") # import data

# what are the dimensions of the data
dim(Book_Reviews) # 2,000 rows and 4 columns

# what are the column names
colnames(Book_Reviews) # book, review, state and price

typeof(Book_Reviews) # data is a list (one element per column)
col_length <- length(Book_Reviews) # number of columns, i.e. how many elements to iterate over

# Collect the storage type of every column so they can be printed together.
# seq_along() gives the column positions and, unlike 1:length(x), yields an
# empty sequence when there are no columns.
iteration_vector <- seq_along(Book_Reviews)

# vapply() passes each column (list element) to typeof() and guarantees exactly
# one string per column; USE.NAMES = FALSE keeps the result an unnamed
# character vector in column order.
all_types <- vapply(Book_Reviews, typeof, character(1), USE.NAMES = FALSE)

print(all_types) # everything is a character except for price

# Inspect each column for data-quality problems before cleaning.
Book_Reviews %>% pull(review) %>% unique() # categorical ratings, so an average can't be taken directly
Book_Reviews %>% pull(state) %>% unique() # same state appears both abbreviated and spelled out
Book_Reviews %>% pull(price) %>% mean() # 31.28
Book_Reviews %>% pull(price) %>% max() # 50
Book_Reviews %>% pull(price) %>% min() # 15.99

# Drop the rows whose review is missing. From the unique() inspection, review
# is the only column that contains NAs.
Book_Reviews_Mod <- Book_Reviews %>%
  filter(!is.na(review))

dim(Book_Reviews_Mod) # reduced data by 206 out of 2000 rows, not bad

# Build the cleaned data set in one pipeline, starting again from the raw data:
#   part 1 - drop rows with a missing review
#   part 2 - corrected_state: collapse spelled-out names and abbreviations
#            into a single two-letter code per state (new column)
#   part 3 - review_num: map the text rating onto a 1-5 scale (new column)
#   part 4 - is_high: logical flag, TRUE when the rating is 4 or 5 (new column)
# Any value not listed in a case_when() falls through to NA.
Book_Reviews_Mod <- Book_Reviews %>%
  filter(!is.na(review)) %>%
  mutate(
    corrected_state = case_when(
      state %in% c("Texas", "TX") ~ "TX",
      state %in% c("New York", "NY") ~ "NY",
      state %in% c("Florida", "FL") ~ "FL",
      state %in% c("California", "CA") ~ "CA"
    ),
    review_num = case_when(
      review == "Poor" ~ 1,
      review == "Fair" ~ 2,
      review == "Good" ~ 3,
      review == "Great" ~ 4,
      review == "Excellent" ~ 5
    ),
    # review_num is 1-5 or NA, so ">= 4" is the same test as "== 4 | == 5";
    # an NA rating stays NA rather than becoming FALSE.
    is_high = review_num >= 4
  )

# Per-book summary used to determine the most profitable book: average price,
# average numeric review, total of price across rows, and number of rows.
# NOTE(review): copies_sold assumes each row is one purchase -- confirm
# against the data description.
Best_Book <- Book_Reviews_Mod %>%
  group_by(book) %>%
  summarise(
    mean_price = mean(price),
    mean_review = mean(review_num),
    total_revenue = sum(price),
    copies_sold = n()
  )

# The goal of this analysis was to determine the most profitable book. Going by total revenue (the sum of price), "Secrets of R for Advanced Students" was the most profitable.
# This book was also the most expensive and sold the 2nd most copies; its reviews, however, were in the middle of the pack.