Shashank Shanu
2 years ago

In this walkthrough we load a set of Amazon product reviews, clean the text with NLTK, look at word and bigram frequencies, and finally pull out the bigrams that point to customer concern areas.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
amazon_reviews = pd.read_csv('amazon_reviews.csv')
amazon_reviews.head()
amazon_reviews.shape
(999, 10)
reviews = pd.DataFrame()
reviews['reviewText'] = amazon_reviews['reviewText']
reviews['overall'] = amazon_reviews['overall']
reviews.head()
reviews.isnull().sum()
reviewText 2
overall 0
dtype: int64
sns.heatmap(reviews.isnull())
reviews = reviews[reviews['reviewText'].notnull()]
reviews.head()
reviews.shape
(997, 2)
combined_reviews = " ".join(reviews['reviewText'])
print(combined_reviews[:1200])
type(combined_reviews)
str
from wordcloud import WordCloud
word_cloud = WordCloud(width = 1000, height = 600, background_color = 'white', max_words = 150).generate(combined_reviews)
plt.figure(figsize = (12,6))
plt.imshow(word_cloud)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()
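WordCloud can also drop common filler words itself via its stopwords parameter; a minimal variant of the call above, using the package's built-in STOPWORDS set (an optional tweak, not part of the original run):

from wordcloud import STOPWORDS
word_cloud = WordCloud(width = 1000, height = 600, background_color = 'white',
                       max_words = 150, stopwords = STOPWORDS).generate(combined_reviews)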
list1 = []
for i in combined_reviews.split():
    list1.append(i)

dictionary1 = {}
for j in list1:
    dictionary1[j] = dictionary1.get(j, 0) + 1
series1 = pd.Series(dictionary1)
word_freq = pd.DataFrame(series1)
word_freq = word_freq.reset_index().rename(columns = {'index':'Words', 0:'Frequency'})
word_freq.head()
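The loop above works, but collections.Counter builds the same frequency table in a couple of lines; an equivalent sketch:

from collections import Counter
word_counts = Counter(combined_reviews.split())
word_freq = pd.Series(word_counts).rename_axis('Words').reset_index(name = 'Frequency')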
top_25_words = word_freq.sort_values(ascending = False, by = 'Frequency')
top_25_words.head(25)
last_25_words = word_freq.sort_values(ascending = False, by = 'Frequency')
last_25_words.tail(25)
import nltk
nltk.download('punkt')   # one-time download of the tokenizer models
from nltk.tokenize import word_tokenize
all_words = word_tokenize(combined_reviews.lower())
print(all_words[:200])
from nltk.probability import FreqDist
fdist = FreqDist(all_words)
fdist
FreqDist({'the': 7854, '.': 6677, ',': 5824, 'i': 4497, 'to': 4419, 'and': 4210, 'a': 3900, 'it': 3656, 'is': 2497, 'for': 2097, ...})
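FreqDist also exposes most_common() if you just want the top entries directly as (word, count) pairs:

fdist.most_common(10)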
plt.figure(figsize = (10,6))
fdist.plot(25, cumulative = False)
plt.show()
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')   # one-time download of the stop word lists
stop_words = stopwords.words('english')
print(stop_words)
print(list(punctuation))
stop_words_updated = stop_words + ['..', '...', 'will', 'would', 'can', 'could', "n't"]
print(stop_words_updated)
all_words_updated = [i for i in all_words
                     if i not in stop_words_updated and i not in list(punctuation)]
print(all_words_updated[:50])
['got', 'gps', 'husband', 'otr', 'road', 'trucker', 'impressed', 'shipping', 'time', 'arrived',
'days', 'earlier', 'expected', 'within', 'week', 'use', 'however', 'started', 'freezing',
'glitch', 'unit', 'worked', 'great', 'worked', 'work', 'great', 'normal', 'person', 'well',
'trucker', 'option', 'big', 'truck', 'routes', 'tells', 'scale', 'coming', 'ect', 'love',
'bigger', 'screen', 'ease', 'use', 'ease', 'putting', 'addresses', 'memory', 'nothing', 'really', 'bad']
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')   # one-time download of the WordNet data
lemma = WordNetLemmatizer()
lemma_words = [lemma.lemmatize(i) for i in all_words_updated]
print(len(set(lemma_words)))
9038
def clean_text(text):
    token = word_tokenize(text.lower())
    lemm = [lemma.lemmatize(i) for i in token
            if i not in stop_words_updated
            and i not in list(punctuation) and len(i) > 2]
    sentence = ' '.join(lemm)
    return sentence
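A quick check on a single made-up sentence (illustrative, not from the dataset) shows what the cleaner keeps:

clean_text("The GPS unit is working great, but the screen froze twice!")
'gps unit working great screen froze twice'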
reviews['clean_reviewText'] = reviews['reviewText'].apply(clean_text)
reviews.head()
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(ngram_range = (2,2))
bigrams = count_vect.fit_transform(reviews['clean_reviewText'])
print(count_vect.get_feature_names_out()[:100])   # get_feature_names() in scikit-learn < 1.0
DTM = pd.DataFrame(bigrams.toarray(), columns = count_vect.get_feature_names_out())
DTM.head()
top_25_bigrams = DTM.sum().sort_values(ascending = False).head(25)
top_25_bigrams
top_25_bigrams.plot(kind = 'bar', figsize = (16,8))
plt.show()
bigrams = DTM.columns
print(bigrams)
Index(['00 100', '00 300', '00 believe', '00 best', '00 came', '00 dollar',
'00 free', '00 great', '00 hit', '00 hold',
...
'zoom touch', 'zoom type', 'zoom update', 'zoom use', 'zooming awkward',
'zooming feature', 'zooming scrolling', 'zs15 photo', 'zune place',
'zune ve'],
dtype='object', length=55110)
negative_words = ['poor', 'waste', 'bad', 'defective',
                  'disgusting', 'untrusty', 'worst',
                  'horrible', 'unexpectedly', 'slow']
negative_bigrams = []
for i in bigrams:
    words = i.split()
    if sum(np.isin(words, negative_words)) >= 1:   # np.in1d in older NumPy versions
        negative_bigrams.append(i)
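The same filter can be written without NumPy using set intersection; an equivalent sketch:

negative_set = set(negative_words)
negative_bigrams = [bg for bg in bigrams if negative_set & set(bg.split())]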
DTM_subset = DTM[negative_bigrams]
top_25_customer_concern_areas = DTM_subset.sum().sort_values(ascending = False).head(25)
top_25_customer_concern_areas
top_25_customer_concern_areas.plot(kind = 'bar', figsize = (16,8))
plt.show()