Airline Quality Service Analysis¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
!pip install pydrive
Collecting pydrive Downloading https://files.pythonhosted.org/packages/52/e0/0e64788e5dd58ce2d6934549676243dc69d982f198524be9b99e9c2a4fd5/PyDrive-1.3.1.tar.gz (987kB) Collecting google-api-python-client>=1.2 (from pydrive) Downloading https://files.pythonhosted.org/packages/3f/f1/20fd18744c3d20307d634ffcc02592bc7efc45a59624e14655cf21cbfb5e/google_api_python_client-1.7.9-py3-none-any.whl (56kB) Collecting oauth2client>=4.0.0 (from pydrive) Downloading https://files.pythonhosted.org/packages/95/a9/4f25a14d23f0786b64875b91784607c2277eff25d48f915e39ff0cff505a/oauth2client-4.1.3-py2.py3-none-any.whl (98kB) Requirement already satisfied: PyYAML>=3.0 in c:\users\karan\appdata\local\programs\python\python36\lib\site-packages (from pydrive) (3.13) Collecting google-auth-httplib2>=0.0.3 (from google-api-python-client>=1.2->pydrive) Downloading https://files.pythonhosted.org/packages/33/49/c814d6d438b823441552198f096fcd0377fd6c88714dbed34f1d3c8c4389/google_auth_httplib2-0.0.3-py2.py3-none-any.whl Collecting google-auth>=1.4.1 (from google-api-python-client>=1.2->pydrive) Downloading https://files.pythonhosted.org/packages/c5/9b/ed0516cc1f7609fb0217e3057ff4f0f9f3e3ce79a369c6af4a6c5ca25664/google_auth-1.6.3-py2.py3-none-any.whl (73kB) Requirement already satisfied: httplib2<1dev,>=0.9.2 in c:\users\karan\appdata\local\programs\python\python36\lib\site-packages (from google-api-python-client>=1.2->pydrive) (0.12.1) Collecting uritemplate<4dev,>=3.0.0 (from google-api-python-client>=1.2->pydrive) Downloading https://files.pythonhosted.org/packages/e5/7d/9d5a640c4f8bf2c8b1afc015e9a9d8de32e13c9016dcc4b0ec03481fb396/uritemplate-3.0.0-py2.py3-none-any.whl Requirement already satisfied: six<2dev,>=1.6.1 in c:\users\karan\appdata\local\programs\python\python36\lib\site-packages (from google-api-python-client>=1.2->pydrive) (1.12.0) Collecting pyasn1>=0.1.7 (from oauth2client>=4.0.0->pydrive) Downloading https://files.pythonhosted.org/packages/7b/7c/c9386b82a25115cccf1903441bba3cbadcfae7b678a20167347fa8ded34c/pyasn1-0.4.5-py2.py3-none-any.whl (73kB) Collecting rsa>=3.1.4 (from oauth2client>=4.0.0->pydrive) Downloading https://files.pythonhosted.org/packages/02/e5/38518af393f7c214357079ce67a317307936896e961e35450b70fad2a9cf/rsa-4.0-py2.py3-none-any.whl Collecting pyasn1-modules>=0.0.5 (from oauth2client>=4.0.0->pydrive) Downloading https://files.pythonhosted.org/packages/91/f0/b03e00ce9fddf4827c42df1c3ce10c74eadebfb706231e8d6d1c356a4062/pyasn1_modules-0.2.5-py2.py3-none-any.whl (74kB) Collecting cachetools>=2.0.0 (from google-auth>=1.4.1->google-api-python-client>=1.2->pydrive) Downloading https://files.pythonhosted.org/packages/2f/a6/30b0a0bef12283e83e58c1d6e7b5aabc7acfc4110df81a4471655d33e704/cachetools-3.1.1-py2.py3-none-any.whl Building wheels for collected packages: pydrive Building wheel for pydrive (setup.py): started Building wheel for pydrive (setup.py): finished with status 'done' Stored in directory: C:\Users\Karan\AppData\Local\pip\Cache\wheels\fa\d2\9a\d3b6b506c2da98289e5d417215ce34b696db856643bad779f4 Successfully built pydrive Installing collected packages: pyasn1, pyasn1-modules, rsa, cachetools, google-auth, google-auth-httplib2, uritemplate, google-api-python-client, oauth2client, pydrive Successfully installed cachetools-3.1.1 google-api-python-client-1.7.9 google-auth-1.6.3 google-auth-httplib2-0.0.3 oauth2client-4.1.3 pyasn1-0.4.5 pyasn1-modules-0.2.5 pydrive-1.3.1 rsa-4.0 uritemplate-3.0.0
You are using pip version 19.0.3, however version 19.1.1 is available. You should consider upgrading via the 'python -m pip install --upgrade pip' command.
In [3]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) <ipython-input-3-04913854c332> in <module> 1 from pydrive.auth import GoogleAuth 2 from pydrive.drive import GoogleDrive ----> 3 from google.colab import auth 4 from oauth2client.client import GoogleCredentials 5 # Authenticate and create the PyDrive client. ModuleNotFoundError: No module named 'google.colab'
In [ ]:
link='https://drive.google.com/open?id=1tmzZKQKEvxt61TxjHchFfJkpqklVgdzP'
fluff,id=link.split('=')
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('airline.csv')
airline_data = pd.read_csv('airline.csv')
In [5]:
airline_data.head(1)
Out[5]:
airline_name | link | title | author | author_country | date | content | aircraft | type_traveller | cabin_flown | route | overall_rating | seat_comfort_rating | cabin_staff_rating | food_beverages_rating | inflight_entertainment_rating | ground_service_rating | wifi_connectivity_rating | value_money_rating | recommended | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | adria-airways | /airline-reviews/adria-airways | Adria Airways customer review | D Ito | Germany | 2015-04-10 | Outbound flight FRA/PRN A319. 2 hours 10 min f… | NaN | NaN | Economy | NaN | 7.0 | 4.0 | 4.0 | 4.0 | 0.0 | NaN | NaN | 4.0 | 1 |
Analysing the DataSet¶
In [6]:
round((airline_data.isna().sum())/len(airline_data),2)
Out[6]:
airline_name 0.00 link 0.00 title 0.00 author 0.00 author_country 0.04 date 0.00 content 0.00 aircraft 0.97 type_traveller 0.94 cabin_flown 0.07 route 0.94 overall_rating 0.11 seat_comfort_rating 0.19 cabin_staff_rating 0.19 food_beverages_rating 0.20 inflight_entertainment_rating 0.25 ground_service_rating 0.95 wifi_connectivity_rating 0.99 value_money_rating 0.04 recommended 0.00 dtype: float64
In [ ]:
airline_data=airline_data[airline_data['overall_rating'].notnull()]
In [8]:
round((airline_data.isna().sum())/len(airline_data),2)
Out[8]:
airline_name 0.00 link 0.00 title 0.00 author 0.00 author_country 0.02 date 0.00 content 0.00 aircraft 0.97 type_traveller 0.94 cabin_flown 0.05 route 0.94 overall_rating 0.00 seat_comfort_rating 0.17 cabin_staff_rating 0.17 food_beverages_rating 0.18 inflight_entertainment_rating 0.23 ground_service_rating 0.94 wifi_connectivity_rating 0.98 value_money_rating 0.03 recommended 0.00 dtype: float64
Calcluating Total Reviews to Each airline¶
In [9]:
airline_names=airline_data.airline_name.unique()
print('Total airlines Considered for Analysis : ',len(airline_names))
total_reviews_each_airline=[]
for i in airline_names:
temp=airline_data[airline_data.airline_name==i]
total_reviews_each_airline.append(len(temp))
result=list(zip(airline_names,total_reviews_each_airline))
print('Total Reviews Analysed : ',sum(total_reviews_each_airline))
Total airlines Considered for Analysis : 357 Total Reviews Analysed : 36861
In [10]:
df=pd.DataFrame(result,columns=['Airline_Name','Total_Reviews'])
df=df.sort_values(by='Total_Reviews',ascending=False)
df.head()
Out[10]:
Airline_Name | Total_Reviews | |
---|---|---|
295 | spirit-airlines | 966 |
97 | british-airways | 896 |
333 | united-airlines | 839 |
20 | air-canada-rouge | 715 |
138 | emirates | 690 |
In [ ]:
In [11]:
plt.style.use('seaborn')
plt.xlabel('Airlines')
plt.ylabel('Reviews')
plt.bar(df.Airline_Name[:5],df.Total_Reviews[:5],label='Top 5 Airlines')
plt.legend()
Out[11]:
<matplotlib.legend.Legend at 0x7f1609e78ac8>
Finding Polarity of Each Review¶
In [ ]:
from textblob import TextBlob
In [ ]:
result1=[]
num=1
numlist=[]
for review in airline_data.content:
analysis=TextBlob(review)
result1.append(analysis.polarity)
numlist.append(num)
num=num+1
result1=np.array(result1)
In [ ]:
Plotting Polarity¶
In [16]:
plt.style.use('seaborn')
plt.scatter(numlist,result1,label='Polarity')
plt.xlabel('Reviews')
plt.ylabel('Polarity')
plt.legend()
Out[16]:
<matplotlib.legend.Legend at 0x7f15ffe53e10>
Generating Word Cloud¶
In [ ]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
text=''
for i in airline_data.content:
text=text+i+' '
In [ ]:
wordcloud = WordCloud().generate(text)
# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
In [ ]:
from PIL import Image
import requests
from io import BytesIO
response = requests.get("http://www.pngmart.com/files/7/Modern-Plane-PNG-HD.png")
mask = np.array(Image.open(BytesIO(response.content)))
wordcloud_fra = WordCloud(background_color="white", mode="RGBA", max_words=1000, mask=mask).generate(text)
# create coloring from image
image_colors = ImageColorGenerator(mask)
plt.figure(figsize=[16,16])
plt.imshow(wordcloud_fra.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
Out[ ]:
(-0.5, 2718.5, 944.5, -0.5)
In [ ]:
Providing Sentiment Value to Each Review According to Rating¶
In [ ]:
sentiment=[]
for i in airline_data.overall_rating:
if(i<=5 and i>=0):
sentiment.append(0)
else:
sentiment.append(1)
Sentiment from polarity¶
In [ ]:
sentiment=[]
for i in result1:
if (i>0):
sentiment.append(1)
elif (i<0):
sentiment.append(-1)
elif (i==0):
sentiment.append(0)
In [ ]:
new_airline_data=airline_data.copy()
In [ ]:
new_airline_data['sentiment']=sentiment
In [ ]:
### Shuffline the Dataset for Training ###
from sklearn.utils import shuffle
new_airline_data=shuffle(new_airline_data)
In [94]:
positive_sentiment_count=new_airline_data[new_airline_data['sentiment']==1]
negative_sentiment_count=new_airline_data[new_airline_data['sentiment']==-1]
neutral_sentiment_count=new_airline_data[new_airline_data['sentiment']==0]
temp_array=[len(positive_sentiment_count),len(negative_sentiment_count),len(neutral_sentiment_count)]
x_axis_labels=['Positive_sentiment','Negative_sentiment','Neutral_sentiment']
plt.bar(x_axis_labels,temp_array,color=('green','red','blue'),width=(0.2,0.2,0.2))
Out[94]:
<BarContainer object of 3 artists>
Logisitc Regression¶
In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
In [ ]:
model=LogisticRegression()
vectorizer=CountVectorizer(ngram_range=(1,2))
x_l=vectorizer.fit_transform(new_airline_data.content.values)
In [23]:
model.fit(x_l[:29488],new_airline_data.sentiment[:29488].values)
/usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning. FutureWarning) /usr/local/lib/python3.6/dist-packages/sklearn/linear_model/logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning. "this warning.", FutureWarning)
Out[23]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False)
In [24]:
model.score(x_l[29488:],new_airline_data.sentiment[29488:].values)
Out[24]:
0.9178082191780822
In [106]:
new_airline_data.content[0]
Out[106]:
"Outbound flight FRA/PRN A319. 2 hours 10 min flight. I thought drinks/snacks for sale but sandwich soft drinks were served complimentary. Inbound flights SKP/LJU/FRA CRJ900. each 1 hour 30 min flight. Skyshop menu was in a seat pocket and drinks/snacks were for sale. All flight crews were friendly. Security check at the Ljubljana airport for transit passengers was chaos however it's possible to go to a gate within 30min."
In [ ]:
y_predict=model.predict(x_l[29488:])
In [ ]:
x_axis=[]
n=1
for i in range(0,len(new_airline_data[29488:].values)):
x_axis.append(n)
n=n+1
In [ ]:
In [28]:
plt.scatter(x_axis,new_airline_data.sentiment[29488:],color='red',label='predicted')
plt.scatter(x_axis,y_predict,label='actual')
plt.legend(loc='best')
Out[28]:
<matplotlib.legend.Legend at 0x7f15ffe95320>
Linear Support Vector Machine¶
In [ ]:
from sklearn.svm import LinearSVC
In [ ]:
clf = LinearSVC(random_state=42, tol=1e-5)
In [31]:
clf.fit(x_l[:29488],new_airline_data.sentiment[:29488])
/usr/local/lib/python3.6/dist-packages/sklearn/svm/base.py:931: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. "the number of iterations.", ConvergenceWarning)
Out[31]:
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='squared_hinge', max_iter=1000, multi_class='ovr', penalty='l2', random_state=42, tol=1e-05, verbose=0)
In [32]:
clf.score(x_l[29488:],new_airline_data.sentiment[29488:])
Out[32]:
0.9152312491523125
In [ ]:
y_predict1=clf.predict(x_l[29488:])
In [71]:
plt.scatter(x_axis,new_airline_data.sentiment[29488:],color='red',label='predicted')
plt.scatter(x_axis,y_predict1,label='actual')
plt.legend()
Out[71]:
<matplotlib.legend.Legend at 0x7f15ec83fb70>
**Preprocessing the Reviews **¶
In [ ]:
t=new_airline_data.copy()
In [ ]:
pos_sentiment=[]
neg_sentiment=[]
net_sentiment=[]
avg_rating=[]
for i in airline_names:
tempdf=t[t['airline_name']==i]
pos=len(tempdf[tempdf['sentiment']==1])
neg=len(tempdf[tempdf['sentiment']==-1])
net=len(tempdf[tempdf['sentiment']==0])
pos_sentiment.append(pos)
neg_sentiment.append(neg)
net_sentiment.append(net)
avg_rate=tempdf.overall_rating.mean()
avg_rating.append(avg_rate)
clustered_data=pd.DataFrame(list(zip(airline_names,avg_rating,pos_sentiment,net_sentiment,neg_sentiment)),columns=['airline_name','average_rating','pos_sentiment','net_sentiment','neg_sentiment'])
In [ ]:
clustered_data=clustered_data.sort_values(by=['pos_sentiment','average_rating'],kind='mergesort',ascending=False)
In [39]:
clustered_data.head()
Out[39]:
airline_name | average_rating | pos_sentiment | net_sentiment | neg_sentiment | |
---|---|---|---|---|---|
97 | british-airways | 5.881696 | 703 | 2 | 191 |
138 | emirates | 6.246377 | 558 | 1 | 131 |
295 | spirit-airlines | 2.902692 | 556 | 15 | 395 |
333 | united-airlines | 3.356377 | 523 | 5 | 311 |
215 | lufthansa | 6.993333 | 511 | 1 | 88 |
In [47]:
plt.figure(figsize=(26,26))
plt.bar(clustered_data.airline_name.head(100).values,clustered_data.pos_sentiment.head(100).values,label='Positive Sentiment',color='green')
plt.bar(clustered_data.airline_name.head(100).values,clustered_data.neg_sentiment.head(100).values,label='Negative Sentiment',color='red')
plt.bar(clustered_data.airline_name.head(100).values,clustered_data.net_sentiment.head(100).values,label='Neutral Sentiment',color='black')
plt.xticks(rotation=90)
plt.legend()
Out[47]:
<matplotlib.legend.Legend at 0x7f15f7f3bfd0>