# Download our e-book of Introduction To Python
# Shashank Shanu
# 2 years ago
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the raw Amazon product reviews, then keep only the two columns the
# rest of the analysis uses: the free-text review and its star rating.
amazon_reviews = pd.read_csv('amazon_reviews.csv')
amazon_reviews.head()
amazon_reviews.shape
# -> (999, 10)   (REPL output from the original transcript)

reviews = pd.DataFrame()
reviews['reviewText'] = amazon_reviews['reviewText']
reviews['overall'] = amazon_reviews['overall']
reviews.head()
# Count and drop missing review texts.
# NOTE: pandas has no isnone()/notnone(); the correct API is
# isnull()/notnull() (aliases of isna()/notna()).
reviews.isnull().sum()
# -> reviewText    2
#    overall       0
#    dtype: int64

# Visualise where the missing values fall.
sns.heatmap(reviews.isnull())

# Keep only the rows whose review text is present.
reviews = reviews[reviews['reviewText'].notnull()]
reviews.head()
reviews.shape
# -> (997, 2)
# Concatenate every review into one long string for corpus-level statistics.
combined_reviews = " ".join(reviews['reviewText'])
print(combined_reviews[:1200])
type(combined_reviews)
# -> str
from wordcloud import WordCloud

# Render a word cloud of the 150 most frequent terms in the whole corpus.
cloud = WordCloud(width=1000,
                  height=600,
                  background_color='white',
                  max_words=150).generate(combined_reviews)
plt.figure(figsize=(12, 6))
plt.imshow(cloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
# Build a word -> frequency table from the whitespace-split corpus.
# The original's manual list+dict counting loops (whose indentation was
# lost in the transcript) are replaced by collections.Counter.
from collections import Counter

word_counts = Counter(combined_reviews.split())
word_freq = pd.DataFrame(pd.Series(word_counts))
word_freq = word_freq.reset_index().rename(columns={'index': 'Words', 0: 'Frequency'})
word_freq.head()
# 25 most frequent words (Python's boolean literal is False, not false).
top_25_words = word_freq.sort_values(ascending=False, by='Frequency')
top_25_words.head(25)

# 25 least frequent words: tail of the same descending sort.
last_25_words = word_freq.sort_values(ascending=False, by='Frequency')
last_25_words.tail(25)
from nltk.tokenize import word_tokenize

# Tokenise the lower-cased corpus and inspect raw token frequencies.
all_words = word_tokenize(combined_reviews.lower())
print(all_words[:200])

from nltk.probability import FreqDist
fdist = FreqDist(all_words)
fdist
# -> FreqDist({'the': 7854, '.': 6677, ',': 5824, 'i': 4497, 'to': 4419,
#              'and': 4210, 'a': 3900, 'it': 3656, 'is': 2497, 'for': 2097, ...})

# Plot the 25 most common tokens (False, not false).
plt.figure(figsize=(10, 6))
fdist.plot(25, cumulative=False)
plt.show()
from nltk.corpus import stopwords
from string import punctuation

# English stop words, extended with corpus-specific noise tokens.
stop_words = stopwords.words('english')
print(stop_words)
print(list(punctuation))
stop_words_updated = stop_words + ['..', '...', 'will', 'would', 'can', 'could', "n't"]
print(stop_words_updated)
# Sample of tokens that survive stop-word filtering (transcript output):
# -> ['got', 'gps', 'husband', 'otr', 'road', 'trucker', 'impressed', 'shipping', 'time', 'arrived',
#     'days', 'earlier', 'expected', 'within', 'week', 'use', 'however', 'started', 'freezing',
#     'glitch', 'unit', 'worked', 'great', 'worked', 'work', 'great', 'normal', 'person', 'well',
#     'trucker', 'option', 'big', 'truck', 'routes', 'tells', 'scale', 'coming', 'ect', 'love',
#     'bigger', 'screen', 'ease', 'use', 'ease', 'putting', 'addresses', 'memory', 'nothing', 'really', 'bad']
from nltk.stem import WordNetLemmatizer

# The transcript lost the step that builds `all_words_updated`; reconstruct
# the obvious filter implied by the surrounding code: drop stop words and
# punctuation tokens before lemmatising. TODO confirm against the original
# article — the sample output above suggests exactly this filtering.
all_words_updated = [w for w in all_words
                     if w not in stop_words_updated
                     and w not in list(punctuation)]

lemma = WordNetLemmatizer()
lemma_words = [lemma.lemmatize(i) for i in all_words_updated]
print(len(set(lemma_words)))
# -> 9038
def clean_text(text):
    """Clean one review: lower-case, tokenise, drop stop words,
    punctuation and tokens of length <= 2, lemmatise, and re-join
    into a single space-separated string.

    (The function body's indentation, lost in the transcript, is restored.)
    """
    token = word_tokenize(text.lower())
    lemm = [lemma.lemmatize(i) for i in token if i not in stop_words_updated
            and i not in list(punctuation) and len(i) > 2]
    sentence = ' '.join(lemm)
    return sentence
# Add a cleaned copy of every review alongside the raw text.
reviews['clean_reviewText'] = reviews['reviewText'].map(clean_text)
reviews.head()
from sklearn.feature_extraction.text import CountVectorizer

# Build a bigram document-term matrix (DTM) over the cleaned reviews.
count_vect = CountVectorizer(ngram_range=(2, 2))
bigram_counts = count_vect.fit_transform(reviews['clean_reviewText'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; get_feature_names_out() is the supported replacement.
print(count_vect.get_feature_names_out()[:100])

DTM = pd.DataFrame(bigram_counts.toarray(),
                   columns=count_vect.get_feature_names_out())
DTM.head()

# 25 most frequent bigrams (False, not false).
top_25_bigrams = DTM.sum().sort_values(ascending=False).head(25)
top_25_bigrams
top_25_bigrams.plot(kind='bar', figsize=(16, 8))
plt.show()

# Reuse `bigrams` for the full bigram vocabulary (the DTM's columns),
# as the original script did.
bigrams = DTM.columns
print(bigrams)
# -> Index(['00 100', '00 300', '00 believe', '00 best', '00 came', '00 dollar',
#           '00 free', '00 great', '00 hit', '00 hold',
#           ...
#           'zoom touch', 'zoom type', 'zoom update', 'zoom use', 'zooming awkward',
#           'zooming feature', 'zooming scrolling', 'zs15 photo', 'zune place',
#           'zune ve'],
#          dtype='object', length=55110)
# Bigrams containing an explicitly negative word point at customer
# concern areas.
negative_words = ['poor', 'waste', 'bad', 'defective',
                  'disgusting', 'untrusty', 'worst',
                  'horrible', 'unexpectedly', 'slow']

# The original loop's indentation was lost in the transcript; it also used
# the deprecated np.in1d. A plain membership test is equivalent: keep a
# bigram if at least one of its two words is in negative_words.
negative_bigrams = [bg for bg in bigrams
                    if any(w in negative_words for w in bg.split())]

DTM_subset = DTM[negative_bigrams]
# NOTE: the name keeps the original's "cutomer" typo in case later code
# (outside this excerpt) references it.
top_25_cutomer_concern_areas = DTM_subset.sum().sort_values(ascending=False).head(25)
top_25_cutomer_concern_areas
top_25_cutomer_concern_areas.plot(kind='bar', figsize=(16, 8))