Using pandas to average rows that share the same date, merge them, and put the result back into a DataFrame

Reference: using pandas to find duplicate rows, take their mean, and merge them

import pandas as pd
import numpy as np
import matplotlib as mpl
%matplotlib inline

from ggplot import *
theme_bw()

ggplot is very troublesome: internally it relies on old pandas APIs such as `sort`, and it also has a problem with dates. The only way to fix this is to patch its .py files by hand.

import numpy as np

import pymongo,pandas as pd
from bson import ObjectId

import matplotlib as mb
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import plotnine as p9

from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

from dateutil import parser

from ggplot import *
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

Import data

# Load the raw hair-dryer review table.
pcfr = pd.read_excel('hair_dryer.xlsx')
df = pcfr

# Optional single-product filter. Leave PRODUCT_TITLE as None to analyse the
# whole table (the behaviour of the original cell, where the filtered frame
# was immediately overwritten by `m = df`). Set it to a product title to
# analyse that product only, e.g.
# 'remington ac2015 t|studio salon collection pearl ceramic hair dryer, deep purple'
PRODUCT_TITLE = None

if PRODUCT_TITLE is None:
    m = df
else:
    # .copy() so that later column assignments do not target a slice of df.
    m = df[df['product_title'] == PRODUCT_TITLE].copy()

Sentiment analysis function

When `df['review_date'] = pd.to_datetime(df['review_date'])` sits in the middle of the function, a baffling error can occur. The date conversion for the whole table is not the same as for a table filtered by brand.

def _sentiment_scores(text):
    """Return ``(polarity, subjectivity)`` for *text*, or NaNs if it cannot be analysed."""
    try:
        sentiment = TextBlob(text).sentiment
    except TypeError:
        # TextBlob refuses non-string input (e.g. a numeric cell read from
        # Excel); such rows get missing scores instead of aborting the run.
        return float('nan'), float('nan')
    return sentiment.polarity, sentiment.subjectivity


def s_c_f(df):
    """Clean the review table and add TextBlob sentiment scores.

    Steps, in order:

    1. drop fully duplicated rows;
    2. drop rows whose ``review_body`` is missing;
    3. convert ``review_date`` to datetime and make it the index;
    4. add ``polarity`` and ``subjectivity`` columns computed from
       ``review_body``.

    The input frame is left untouched; all work happens on a copy. This also
    avoids assigning into a slice of another frame when ``df`` is a filtered
    view of a larger table.

    Args:
        df: DataFrame with at least ``review_body`` and ``review_date``
            columns.

    Returns:
        A new DataFrame indexed by ``review_date`` with the original
        remaining columns plus ``polarity`` and ``subjectivity``. Reviews
        that TextBlob cannot process get NaN scores.
    """
    # The original statements for steps 1 and 2 only inspected the data and
    # discarded the result; here the rows are actually removed.
    cleaned = df.drop_duplicates().dropna(subset=['review_body']).copy()

    cleaned['review_date'] = pd.to_datetime(cleaned['review_date'])
    cleaned = cleaned.set_index('review_date')

    # Analyse each review once and split the result into the two columns.
    scores = [_sentiment_scores(text) for text in cleaned['review_body']]
    cleaned['polarity'] = [polarity for polarity, _ in scores]
    cleaned['subjectivity'] = [subjectivity for _, subjectivity in scores]

    return cleaned

Drawing function

Only polarity

def drawfig_polarity(result):
    """Build a ggplot chart of polarity over review date.

    The chart layers points, a blue line and a smoother (span 0.1) over the
    same ``review_date`` / ``polarity`` mapping.

    Args:
        result: DataFrame with a ``polarity`` column; assumed to carry
            ``review_date`` as its index (as produced by ``set_index``), so
            that ``reset_index()`` exposes it as a column.

    Returns:
        The assembled ggplot object.
    """
    # reset_index() returns a new frame rather than modifying `result`, so
    # the date index becomes a plain column only in this local copy.
    frame = result.reset_index()

    chart = ggplot(aes(x='review_date', y='polarity'), data=frame)
    chart = chart + geom_point()
    chart = chart + geom_line(color='blue')
    chart = chart + stat_smooth(span=0.1)

    return chart

Plot all three series

def drawfig_star(df):
    """Plot normalized subjectivity, polarity and star rating on one chart.

    Each of the three columns is standardized to zero mean and unit standard
    deviation so they share a scale, the frame is melted to long form with
    the index as the x value, and the series are drawn as points, a smoother
    (span 0.1) and lines, coloured by variable.

    The caller's frame is not modified: normalization and the helper ``x``
    column are applied to a copy. (The original wrote them straight into the
    argument, silently changing the caller's data.)

    Args:
        df: DataFrame with ``subjectivity``, ``polarity`` and ``star_rating``
            columns; its index supplies the x axis. Every other column is
            melted into the plot as an additional variable.

    Returns:
        The ggplot object, after ``make()`` has been called on it.
    """
    def normalize(data):
        # Standard score: centre on the mean, scale by the standard deviation.
        return (data - data.mean()) / data.std()

    scaled = df.copy()
    for column in ('subjectivity', 'polarity', 'star_rating'):
        scaled[column] = normalize(scaled[column])

    # melt() needs the x values as a regular column, not as the index.
    scaled['x'] = scaled.index
    long_form = pd.melt(scaled, id_vars='x')

    plot = ggplot(aes(x='x', y='value', color='variable'), long_form) + \
        geom_point() + stat_smooth(span=0.1) + geom_line()

    # Kept from the original; presumably builds the underlying figure so it
    # can be resized before display -- TODO confirm against the ggplot port.
    plot.make()

    return plot
# Run the cleaning + sentiment step on the selected reviews; the returned
# frame is indexed by review_date and carries polarity/subjectivity columns.
result = s_c_f(m)

Drop the columns we do not need. Note that the data cleaned by my teammates contains several percentage-related columns (for example `help_precentage`); at the beginning they kept causing errors for a long time.

# s_c_f() made review_date the index, so per-period averages can be taken
# with a groupby on the index, e.g. result.groupby(result.index.day).mean().
# Before averaging, remove the columns that are not needed; the names
# (including the spelling 'help_precentage') are those used in the data.
resultdf = result.drop(
    columns=[
        'review_body',
        'product_title',
        'helpful_votes',
        'total_votes',
        'help_precentage',
        'vine',
        'verified_purchase',
        'review_headline',
    ]
)
resultdf

The main part: averaging by date

We have a lot of data, and the number of records differs from day to day. I want exactly one record per date. I spent a long time thinking about how to take the average and put it back again, but the method is actually very simple.

# Average every column within each (year, month) of the review_date index.
# Renaming the index attributes gives the resulting index levels the names
# 'year' and 'month'.
# numeric_only=True restricts the aggregation to numeric columns. Older
# pandas dropped non-numeric columns silently, which this notebook relied
# on; pandas >= 2.0 raises a TypeError on them instead.
raw_df = resultdf.groupby(
    [resultdf.index.year.rename('year'), resultdf.index.month.rename('month')]
).mean(numeric_only=True)

At this point the rows with duplicate dates have been averaged. How to deal with the year and month that now form the index is another problem. It looks like this now.


Put them back as columns first. Note that neither `set_index()` nor `reset_index()` modifies the DataFrame in place; the result has to be assigned to a variable.
At the same time, create a new `day` column.

# Turn the 'year'/'month' index levels back into ordinary columns and add a
# constant 'day' column (first of the month) in the same step.
raw_df = raw_df.reset_index().assign(day=1)


A very handy function: `pd.to_datetime` generates a date in datetime format from the columns. But it needs all three columns (`year`, `month`, `day`), which is why we created the new `day` column.

from datetime import datetime
# Assemble one datetime per row from the year/month/day columns and store
# it as the new 'review_date' column.
raw_df = raw_df.assign(
    review_date=pd.to_datetime(raw_df[['year', 'month', 'day']])
)


An optional step.

# Sort the rows by index label in descending order.
raw_df = raw_df.sort_index(ascending=False)

Drop the columns that are no longer needed.

# The helper columns have served their purpose: remove them and make the
# rebuilt review_date the index again, then display the frame.
raw_df = raw_df.drop(columns=['year', 'month', 'day']).set_index('review_date')
raw_df


Our problem is solved perfectly, so now we can start plotting.

# Polarity-only chart of the averaged data, saved under the name 'new_plot'.
plot1 = drawfig_polarity(raw_df)
plot1.save(filename = 'new_plot')

# Combined chart built by drawfig_star from the same averaged frame.
plot2 = drawfig_star(raw_df)
# Saving plot2 is currently disabled; the two variants that were tried:
#ggsave(filename="new_plot2.jpg", width=20, height=4, units='in', plot=plot2)
#plot2.save(filename='new_plot2')

111 original articles published, praised 7, visited 4088
Private letter follow

Tags: Lambda

Posted on Wed, 11 Mar 2020 05:22:39 -0700 by paddyhaig