Analysis of table tennis products in Taobao

import pandas as pd
#Reference website
#https://mp.weixin.qq.com/s/ztm9-LoPic2etFDGB95jmQ
# Load the product spreadsheet into a DataFrame.
test=pd.read_excel('products_data.xls')
# Preview the first rows to check the columns and the raw value formats.
test.head()
Shop name Commodity information Selling price Number of payments Place of delivery
0 Kundome sports outdoor franchise store Sticha table tennis racket ebony 7 professional racket offensive pure wood ebony 5 Maple 7 stacca base plate ¥1048.00 38 party payment Shijiazhuang, Hebei
1 Milly sports store Butterfly table tennis racket Butterfly King Single racket professional 8-star carbon base plate 8-star table tennis horizontal racket authentic direct racket ¥478.00 114 party payment Shanghai
2 luciferchen123 Yinglian STIGA stikastica blue label Xu Xin carbon DYNASTY table tennis racket DYNASTY ¥1074.40 25 party payment Beijing
3 Shengnong sports store STIGA stika carbon Dynasty ¥1106.00 1 party payment Jinhua, Zhejiang
4 luciferchen123 Yinglian Hongshuangxi table tennis baseboard racket new version of dragon 5 x dragon 5 dragon 2 Dragon 3 three horse Dragon ¥797.00 58 party payment Beijing

Data preprocessing

# Summarise the column dtypes and non-null counts.
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1062 entries, 0 to 1061
Data columns (total 5 columns):
Shop name 1062 non-null object
Commodity information 1062 non-null object
Selling price 1062 non-null object
Number of payments 1062 non-null object
Place of delivery 1062 non-null object
dtypes: object(5)
memory usage: 41.6+ KB
# Count the missing values in each column.
test.isnull().sum()
Shop name 0
Commodity information 0
Selling price 0
Number of payments 0
Place of delivery 0
dtype: int64
# Keep the text after the currency sign and convert the prices to float.
price_text = test['Selling price'].map(lambda raw: raw.split('¥')[1])
test['Selling price'] = price_text.astype('float')
# Keep the text before the 'people' suffix and convert the counts to int.
# NOTE(review): confirm that the suffix token matches the raw spreadsheet values.
payer_text = test['Number of payments'].map(lambda raw: raw.split('people')[0])
test['Number of payments'] = payer_text.astype('int')
import codecs
import jieba
import pickle
# test['Commodity information'].to_csv('name.txt', sep='\t', index=False)
# fin = codecs.open('name.txt', mode='r', encoding='utf-8')
# # print(fin.read())
# # Save the segmented words to a file the first time the program is run
# text = ''
# with open('name.txt', encoding='utf-8') as fin:
#     for line in fin.readlines():
#         line = line.strip('\n')
#         text += ' '.join(jieba.cut(line))
#         text += ' '
# fout = open('text.txt','wb')
# pickle.dump(text,fout)
# fout.close()
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
# Load the segmented title text that the commented-out preprocessing step
# pickled into text.txt. The context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code; only load a text.txt that
# was generated locally by this notebook.
with open('text.txt', 'rb') as fr:
    text = pickle.load(fr)
# print(text)
backgroud_Image = plt.imread('table_tennis.jpg')
wc = WordCloud(
    background_color='white',    # Background color
    mask=backgroud_Image,        # Background picture used as the mask
    max_words=200,               # Maximum number of words displayed
    stopwords=STOPWORDS,         # Stop words to leave out
    font_path='simfang.ttf',     # Font file; if not set, Chinese will not be displayed
    max_font_size=200,           # Largest font size
    random_state=8,              # Seed for the random generator, fixed for reproducible output
)
wc.generate(text)
# Recolor the words with colors taken from the background picture.
image_colors = ImageColorGenerator(backgroud_Image)
wc.recolor(color_func=image_colors)
plt.figure(figsize=(10, 10))
plt.imshow(wc)
plt.axis('off')
plt.show()

# 2. Reduce every delivery place to the text before its first space;
# values that contain no space are kept unchanged.
raw_location = test['Place of delivery'].values
new_location = [place.partition(' ')[0] for place in raw_location]
# Replace the column with the shortened values.
test['Place of delivery'] = new_location
print(test['Place of delivery'].values)
['Hebei' 'Shanghai' 'Beijing' ... 'Henan' 'Henan' 'Beijing']
# Preview the first rows again after the cleaning steps above.
test.head()
Shop name Commodity information Selling price Number of payments Place of delivery
0 Kundome sports outdoor franchise store Sticha table tennis racket ebony 7 professional racket offensive pure wood ebony 5 Maple 7 stacca base plate 1048.0 38 Hebei
1 Milly sports store Butterfly table tennis racket Butterfly King Single racket professional 8-star carbon base plate 8-star table tennis horizontal racket authentic direct racket 478.0 114 Shanghai
2 luciferchen123 Yinglian STIGA stikastica blue label Xu Xin carbon DYNASTY table tennis racket DYNASTY 1074.4 25 Beijing
3 Shengnong sports store STIGA stika carbon Dynasty 1106.0 1 Zhejiang
4 luciferchen123 Yinglian Hongshuangxi table tennis baseboard racket new version of dragon 5 x dragon 5 dragon 2 Dragon 3 three horse Dragon 797.0 58 Beijing

Analysis of the relationship between high-frequency keywords in table tennis racket titles and the number of products

import jieba.analyse
# Extract the 50 top TextRank keywords, with their weights, from all product titles joined into one string.
keywords_count_list = jieba.analyse.textrank(' '.join(test['Commodity information']), topK=50, withWeight=True)
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.890 seconds.
Prefix dict has been built succesfully.
# Start a zero counter for each of the first 20 extracted keywords,
# inserted in reverse list order (20th entry first).
keywords_count_dict = {item[0]: 0 for item in reversed(keywords_count_list[:20])}
# Segment all titles and count how often each tracked keyword appears as a token.
cut_words = jieba.cut(' '.join(test['Commodity information']))
for word in cut_words:
    # One dict lookup per token instead of scanning every keyword.
    if word in keywords_count_dict:
        keywords_count_dict[word] += 1
print(keywords_count_dict)
{'Rose': 82, 'finished product': 67, 'Hurricane': 91, 'Juniper': 63, 'Arc circle': 90, 'Fast break': 89, 'Professional level': 93, 'major': 91, 'Offensive type': 103, 'The galaxy': 138, 'Pure wood': 144, 'Racket': 210, 'Butterfly': 219, 'penhold': 230, 'Shake-hands grip': 254, 'carbon': 337, 'Quality goods': 454, 'Table Tennis': 632, 'Table tennis bat': 793, 'floor': 1419}
from pyecharts.charts import Pie, Bar, Map, WordCloud
from pyecharts import options as opts
# Bar chart of the keyword counts, drawn with the axes swapped.
keywords_count_bar = Bar()
keywords_count_bar.add_xaxis([*keywords_count_dict])
keywords_count_bar.add_yaxis("", [*keywords_count_dict.values()])
keywords_count_bar.reversal_axis()
# Put each value label to the right of its bar.
keywords_count_bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
keywords_count_bar.set_global_opts(
    title_opts=opts.TitleOpts(title="Table tennis racket keywords TOP20"),
    yaxis_opts=opts.AxisOpts(name="Keyword"),
    xaxis_opts=opts.AxisOpts(name="frequency"),
)
keywords_count_bar.render_notebook()

    <div id="a79de72ca6bf4395a81e892dd7bd0f15" style="width:900px; height:500px;"></div>

Analysis of the relationship between table tennis racket title keywords and average sales volume

def analysis_title_keywords(keywords_count_list, column, top_num, data=None) -> dict:
    """
    Average an attribute over the products whose title contains each keyword.

    A title counts for a keyword when the keyword occurs anywhere in its
    'Commodity information' text (substring match).

    :param keywords_count_list: Sequence whose items carry the keyword at index 0
    :param column: Name of the numeric column to average
    :param top_num: How many of the highest-average keywords to keep
    :param data: Table providing 'Commodity information' and ``column``;
        defaults to the module-level ``test`` DataFrame
    :return: dict of keyword -> average, ordered from the smallest to the
        largest average and limited to the ``top_num`` largest. Keywords
        found in no title are left out. Empty when ``top_num`` <= 0.
    """
    if data is None:
        data = test
    # A slice of [-0:] would return every keyword, so handle this explicitly.
    if top_num <= 0:
        return {}

    # 1. One bucket of attribute values per keyword.
    matched_values = {item[0]: [] for item in keywords_count_list}
    # 2. Walk the two columns in lockstep and fill the buckets.
    for title, value in zip(data['Commodity information'], data[column]):
        for keyword, values in matched_values.items():
            if keyword in title:
                values.append(value)
    # 3. Average each bucket; skip empty buckets to avoid dividing by zero.
    averages = {
        keyword: sum(values) / len(values)
        for keyword, values in matched_values.items()
        if values
    }
    # 4. Sort by average, from small to large, and keep the top_num largest.
    ranked = sorted(averages.items(), key=lambda pair: pair[1])
    return dict(ranked[-top_num:])
# Average 'Number of payments' per title keyword, keeping the 20 highest.
keywords_sales_dict = analysis_title_keywords(keywords_count_list, 'Number of payments', 20)
# Generate histogram
keywords_sales_bar = Bar()
keywords_sales_bar.add_xaxis([*keywords_sales_dict])
keywords_sales_bar.add_yaxis("", [*keywords_sales_dict.values()])
keywords_sales_bar.reversal_axis()
# Put each value label to the right of its bar.
keywords_sales_bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
keywords_sales_bar.set_global_opts(
    title_opts=opts.TitleOpts(title="Table tennis racket keywords TOP20"),
    yaxis_opts=opts.AxisOpts(name="Keyword"),
    xaxis_opts=opts.AxisOpts(name="Average sales volume"),
)
keywords_sales_bar.render_notebook()

    <div id="637cf0a54fbe47198db24d37829cb88d" style="width:900px; height:500px;"></div>

Analysis of the price distribution of table tennis racket base plates

def cut_and_sort_data(listBins, listLabels, data_list) -> dict:
    """
    Count how many values of ``data_list`` fall into each labelled bin.

    :param listBins: Bin edges handed to ``pd.cut``
    :param listLabels: One label per bin, in bin order
    :param data_list: Values to sort into the bins
    :return: dict of label -> count, in ``listLabels`` order; a label with
        no values maps to 0. Values that land in no bin (outside the edges
        or missing) are ignored.
    """
    data_labels_list = pd.cut(data_list, bins=listBins, labels=listLabels, include_lowest=True)
    # Generate the dictionary in listLabels order so no re-sorting is needed later.
    data_count = {label: 0 for label in listLabels}
    for value in data_labels_list:
        # pd.cut marks values that fit no bin as NaN, which is not a valid label key.
        if pd.isna(value):
            continue
        data_count[value] += 1
    return data_count
# Bin edges: 100-yuan steps from 0 to 800, then one catch-all bin up to 1000000.
price_list_bins = list(range(0, 900, 100)) + [1000000]
# Set corresponding label after segmentation
price_list_labels = ['{}-{}'.format(low, low + 100) for low in range(0, 800, 100)] + ['800 Above']
# Zoning statistics
price_count = cut_and_sort_data(price_list_bins, price_list_labels, test['Selling price'])
print(price_count)
# Generate histogram
bar = Bar()
bar.add_xaxis([*price_count])
bar.add_yaxis("", [*price_count.values()])
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="Table tennis racket base plate price interval distribution column"),
    yaxis_opts=opts.AxisOpts(name="A commodity"),
    xaxis_opts=opts.AxisOpts(name="Commodity price: Yuan"),
)
bar.render_notebook()
{'0-100': 132, '100-200': 144, '200-300': 126, '300-400': 103, '400-500': 91, '500-600': 66, '600-700': 63, '700-800': 22, '800 Above': 315}


    <div id="6c419d06e2ef4e3ba7c8f0875edeb484" style="width:900px; height:500px;"></div>
# Pie chart
# Turn the label -> count mapping into [label, count] pairs for the pie series.
age_count_list = [[label, count] for label, count in price_count.items()]
pie = Pie()
pie.add("", age_count_list)
pie.set_global_opts(title_opts=opts.TitleOpts(title="Table tennis racket base price range pie chart"))
# Label every slice as "name: value".
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
pie.render_notebook()
    <div id="8ebf790e6e944e62959bf5470be0d4a3" style="width:900px; height:500px;"></div>

Analysis on the national distribution of table tennis merchants

# Count the listings per delivery place and convert them to [place, count] pairs.
province_sales = test['Place of delivery'].value_counts()
province_sales_list = [[province, count] for province, count in province_sales.items()]
print(province_sales_list)
# 1.1 generate heat map
province_sales_map = Map()
province_sales_map.add("Distribution of table tennis players in China", province_sales_list, "china")
province_sales_map.set_global_opts(visualmap_opts=opts.VisualMapOpts(max_=1000))
province_sales_map.render_notebook()
[['Beijing', 178], ['Hebei', 162], ['Shanghai', 152], ['Zhejiang', 141], ['Guangdong', 115], ['Jiangsu', 93], ['Henan', 70], ['Tianjin', 57], ['Hunan', 31], ['Japan', 25], ['Shandong', 18], ['Fujian', 6], ['Hubei', 5], ['overseas', 4], ['Sichuan', 3], ['Shaanxi', 1], ['Liaoning', 1]]
    <div id="372c40b6b1814216b67e45775d988387" style="width:900px; height:500px;"></div>
# 1.2 generate histogram
province_sales_bar = Bar()
province_sales_bar.add_xaxis(province_sales.index.tolist())
province_sales_bar.add_yaxis("", province_sales.values.tolist(), category_gap="50%")
province_sales_bar.set_global_opts(
    title_opts=opts.TitleOpts(title="Table tennis business number area histogram"),
    yaxis_opts=opts.AxisOpts(name="Number of merchants"),
    # Rotate the region labels by 90 degrees.
    xaxis_opts=opts.AxisOpts(name="region", axislabel_opts={"rotate": 90}),
)
province_sales_bar.render_notebook()

    <div id="2853305a414a4176b2f2e8ec08dacd10" style="width:900px; height:500px;"></div>
63 original articles published, 18 praised, 9490 visited
Private letter follow

Tags: Attribute Lambda encoding

Posted on Mon, 13 Jan 2020 05:07:12 -0800 by djdon11