import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Current seaborn colour palette; the plots below pick colours from it by index.
color = sns.color_palette()
python
# Load the women's clothing review dataset and display it to inspect its structure.
CSV_PATH = 'Womens_Clothing.csv'
data = pd.read_csv(CSV_PATH)
data
python
123 Unnamed: 0Clothing IDAgeTitleReview TextRatingRecommended INDPositive Feedback CountDivision NameDepartment NameClass Name0076733NaNAbsolutely wonderful - silky and sexy and comf...410InitmatesIntimateIntimates11108034NaNLove this dress! it's sooo pretty. i happene...514GeneralDressesDresses22107760Some major design flawsI had such high hopes for this dress and reall...300GeneralDressesDresses33104950My favorite buy!I love, love, love this jumpsuit. it's fun, fl...510General PetiteBottomsPants4484747Flattering shirtThis shirt is very flattering to all due to th...516GeneralTopsBlouses....................................2348123481110434Great dress for many occasionsI was very happy to snag this dress at such a ...510General PetiteDressesDresses234822348286248Wish it was made of cottonIt reminds me of maternity clothes. soft, stre...310General PetiteTopsKnits2348323483110431Cute, but see throughThis fit well, but the top was very see throug...301General PetiteDressesDresses2348423484108428Very cute dress, perfect for summer parties an...I bought this dress for a wedding i have this ...312GeneralDressesDresses2348523485110452Please make more like this one!This dress in a lovely platinum is feminine an...5122General PetiteDressesDresses23486 rows × 11 columns
由上面结果可知:
该数据集包括23486行和10个特征变量(另有一列与行号相同的 Unnamed: 0,共11列)。每行对应一条客户评论,并包含以下变量:
**服装ID**:整数分类变量,指被评论的具体商品。
**年龄**:评论者年龄的正整数变量。
**标题**:评论标题的字符串变量。
**评论文本**:评论正文的字符串变量。
**评分**:客户给出的产品评分,为有序整数变量,1为最差,5为最佳。
**推荐的IND**:二进制变量,表示客户是否推荐该产品:1表示推荐,0表示不推荐。
**积极的反馈计数**:非负整数,记录认为该评论是积极的其他客户的数量。
**高级部门名称**:产品所属高级部门(Division)的分类名称。
**部门名称**:产品所属部门(Department)的分类名称。
**类名称**:产品所属类别(Class)的分类名称。
| 中文名称 | 英文名称 |
| --- | --- |
| 服装ID | Clothing ID |
| 年龄 | Age |
| 标题 | Title |
| 评论文本 | Review Text |
| 评分 | Rating |
| 推荐的IND | Recommended IND |
| 积极的反馈计数 | Positive Feedback Count |
| 高级部门名称 | Division Name |
| 部门名称 | Department Name |
| 类名称 | Class Name |
# Print a summary of the DataFrame: column names, non-null counts, dtypes and memory usage.
data.info()
python
1<class 'pandas.core.frame.DataFrame'> RangeIndex: 23486 entries, 0 to 23485 Data columns (total 11 columns): Unnamed: 0 23486 non-null int64 Clothing ID 23486 non-null int64 Age 23486 non-null int64 Title 19676 non-null object Review Text 22641 non-null object Rating 23486 non-null int64 Recommended IND 23486 non-null int64 Positive Feedback Count 23486 non-null int64 Division Name 23472 non-null object Department Name 23472 non-null object Class Name 23472 non-null object dtypes: int64(6), object(5) memory usage: 2.0+ MB
# Missing values can be inspected with `data.isnull()` if needed.
# Drop every row that has at least one missing value. The explicit copy makes
# `df` independent of `data`, so adding columns to `df` later does not raise
# pandas' SettingWithCopyWarning.
df = data.dropna().copy()
df
python
12345 Unnamed: 0Clothing IDAgeTitleReview TextRatingRecommended INDPositive Feedback CountDivision NameDepartment NameClass Name22107760Some major design flawsI had such high hopes for this dress and reall...300GeneralDressesDresses33104950My favorite buy!I love, love, love this jumpsuit. it's fun, fl...510General PetiteBottomsPants4484747Flattering shirtThis shirt is very flattering to all due to th...516GeneralTopsBlouses55108049Not for the very petiteI love tracy reese dresses, but this one is no...204GeneralDressesDresses6685839Cagrcoal shimmer funI aded this in my basket at hte last mintue to...511General PetiteTopsKnits....................................2348123481110434Great dress for many occasionsI was very happy to snag this dress at such a ...510General PetiteDressesDresses234822348286248Wish it was made of cottonIt reminds me of maternity clothes. soft, stre...310General PetiteTopsKnits2348323483110431Cute, but see throughThis fit well, but the top was very see throug...301General PetiteDressesDresses2348423484108428Very cute dress, perfect for summer parties an...I bought this dress for a wedding i have this ...312GeneralDressesDresses2348523485110452Please make more like this one!This dress in a lovely platinum is feminine an...5122General PetiteDressesDresses19662 rows × 11 columns
# 1. Histogram of the ages of the reviewers.
plt.hist(df['Age'], color=color[1], label='age')
plt.legend()
plt.xlabel('age')
plt.ylabel('count')
plt.title('age of commentator')
# The leading '\n' puts a blank line before the figure caption.
print('\n figure 01')
python
1234567figure 01 1

由figure01 可得出:给出评论的人的年龄大多在25到45之间,青年、中年人较多
# 2. Box plot of reviewer age for each rating level.
plt.figure(figsize=(10, 8))
sns.boxplot(x='Rating', y='Age', data=df)
plt.title('age of rating')
# The leading '\n' puts a blank line before the figure caption.
print('\n figure 02')
python
12345figure 02 1

由figure02 可得出:各评分等级对应的评论者年龄分布都差不多
3、每个部门、推荐什么服装?
查看 Division Name、Department Name 和 Class Name 的唯一值
# Print the distinct values of the three categorical columns, separated by blank lines.
labelled_columns = [
    ('高级部门Division Name', 'Division Name'),
    ('部门Department Name', 'Department Name'),
    ('类名称Class Name', 'Class Name'),
]
for position, (label, column) in enumerate(labelled_columns):
    if position:
        # Blank separator line between consecutive sections only.
        print()
    print(label, df[column].unique())
python
12345高级部门Division Name ['General' 'General Petite' 'Initmates'] 部门Department Name ['Dresses' 'Bottoms' 'Tops' 'Intimate' 'Jackets' 'Trend'] 类名称Class Name ['Dresses' 'Pants' 'Blouses' 'Knits' 'Intimates' 'Outerwear' 'Lounge' 'Sweaters' 'Skirts' 'Fine gauge' 'Sleep' 'Jackets' 'Swim' 'Trend' 'Jeans' 'Shorts' 'Legwear' 'Layering' 'Casual bottoms' 'Chemises'] 1234567
将Recommended IND推荐产品为1,不推荐0的数据分开
# Split the reviews by the 'Recommended IND' flag: 1 goes to `recommend`,
# 0 goes to `not_recommend`.
recommend = df.loc[df['Recommended IND'].eq(1)]
not_recommend = df.loc[df['Recommended IND'].eq(0)]
# Preview the not-recommended subset (recommend.head() shows the other one).
not_recommend.head()
python
# Output of `not_recommend.head()` (reconstructed from the flattened dump).
# Columns: index | Unnamed: 0 | Clothing ID | Age | Title | Review Text |
#          Rating | Recommended IND | Positive Feedback Count |
#          Division Name | Department Name | Class Name
#   2  | 2  | 1077 | 60 | Some major design flaws | I had such high hopes for this dress and reall... | 3 | 0 | 0  | General   | Dresses  | Dresses
#   5  | 5  | 1080 | 49 | Not for the very petite | I love tracy reese dresses, but this one is no... | 2 | 0 | 4  | General   | Dresses  | Dresses
#   10 | 10 | 1077 | 53 | Dress looks like it's made of cheap material | Dress runs small esp where the zipper area run... | 3 | 0 | 14 | General | Dresses | Dresses
#   22 | 22 | 1077 | 31 | Not what it looks like | First of all, this is not pullover styling. th... | 2 | 0 | 7  | General   | Dresses  | Dresses
#   25 | 25 | 697  | 31 | Falls flat | Loved the material, but i didnt really look at... | 3 | 0 | 0  | Initmates | Intimate | Lounge

# 4. Overlaid bar chart of recommended vs. not-recommended review counts per department.
# Counts are computed per category and drawn with plt.bar rather than
# plt.hist: hist's default 10 equal-width bins are unrelated to the
# categories, so bars can sit off their tick labels and the two calls can end
# up with different bin edges.
department_order = pd.concat(
    [recommend['Department Name'], not_recommend['Department Name']]
).dropna().unique()
recommend_counts = (
    recommend['Department Name'].value_counts().reindex(department_order, fill_value=0)
)
not_recommend_counts = (
    not_recommend['Department Name'].value_counts().reindex(department_order, fill_value=0)
)
plt.figure(figsize=(12, 8))
plt.bar(department_order, recommend_counts, color=color[2], alpha=0.5, label='recommend')
plt.bar(department_order, not_recommend_counts, color=color[4], alpha=0.5, label='not_recommend')
plt.legend()
plt.xticks(rotation=45)
plt.title('Department recommend and not_recommend')
# The leading '\n' puts a blank line before the figure caption.
print('\n figure 03')
python
12345678figure 03 1

由figure03可知 绿色(recommend)的面积大于另一种颜色(not_recommend)的面积,由此说明,大部分部门的商品都被推荐
# Overlaid bar chart of recommended vs. not-recommended review counts per product class.
# Counts are computed per category and drawn with plt.bar rather than
# plt.hist: with more than 10 classes, hist's default 10 equal-width bins
# merge several classes into one bar, and the two calls can end up with
# different bin edges.
class_order = pd.concat(
    [recommend['Class Name'], not_recommend['Class Name']]
).dropna().unique()
recommend_counts = (
    recommend['Class Name'].value_counts().reindex(class_order, fill_value=0)
)
not_recommend_counts = (
    not_recommend['Class Name'].value_counts().reindex(class_order, fill_value=0)
)
plt.figure(figsize=(12, 8))
plt.bar(class_order, recommend_counts, color=color[1], alpha=0.5, label='recommend')
plt.bar(class_order, not_recommend_counts, color=color[5], alpha=0.5, label='not_recommend')
plt.legend()
plt.xticks(rotation=45)
plt.title('Class recommend and not_recommend')
# The leading '\n' puts a blank line before the figure caption.
print('\n figure 04')
python
12345678figure 04 1

从figure04看出:并不是卖最多的Knits商品推荐成功率最大
# Which age groups write what kind of review: start by measuring review length.
# Add the character count of every review as the column 'Review Length'.
# `assign` returns a new DataFrame instead of writing into `df` in place, so
# pandas does not emit SettingWithCopyWarning when `df` is derived from
# another frame.
df = df.assign(**{'Review Length': df['Review Text'].astype(str).apply(len)})
df
python
123E:anacondalibsite-packagesipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy 12345
Unnamed: 0Clothing IDAgeTitleReview TextRatingRecommended INDPositive Feedback CountDivision NameDepartment NameClass NameReview Length22107760Some major design flawsI had such high hopes for this dress and reall...300GeneralDressesDresses50033104950My favorite buy!I love, love, love this jumpsuit. it's fun, fl...510General PetiteBottomsPants1244484747Flattering shirtThis shirt is very flattering to all due to th...516GeneralTopsBlouses19255108049Not for the very petiteI love tracy reese dresses, but this one is no...204GeneralDressesDresses4886685839Cagrcoal shimmer funI aded this in my basket at hte last mintue to...511General PetiteTopsKnits496.......................................2348123481110434Great dress for many occasionsI was very happy to snag this dress at such a ...510General PetiteDressesDresses131234822348286248Wish it was made of cottonIt reminds me of maternity clothes. soft, stre...310General PetiteTopsKnits2232348323483110431Cute, but see throughThis fit well, but the top was very see throug...301General PetiteDressesDresses2082348423484108428Very cute dress, perfect for summer parties an...I bought this dress for a wedding i have this ...312GeneralDressesDresses4272348523485110452Please make more like this one!This dress in a lovely platinum is feminine an...5122General PetiteDressesDresses11019662 rows × 12 columns
# Distribution of the single variable 'Review Length'.
# sns.distplot draws a histogram and, by default, fits a kernel density
# estimate (KDE) on top of it.
# NOTE(review): distplot is deprecated in newer seaborn releases in favour of
# histplot/displot — confirm the installed version before upgrading seaborn.
fig = plt.figure(figsize=(12, 8))
ax = sns.distplot(df['Review Length'], color=color[3])
# Title the returned Axes directly; the previous `ax = plt.title(...)` replaced
# the Axes reference with the Text object returned by plt.title.
ax.set_title("Length of Reviews")
# The leading '\n' puts a blank line before the figure caption.
print('\n figure 05')
python
123456figure 05 1

由figure05可得出 大部分评论的长度基本都在500个字符左右
# Box plot of review length for each reviewer age.
plt.figure(figsize=(18, 8))
sns.boxplot(x='Age', y='Review Length', data=df)
# The leading '\n' puts a blank line before the figure caption.
print('\n figure 06')
python
1234figure 06 1

# Box plot of the positive feedback count for each rating level.
plt.figure(figsize=(12, 8))
sns.boxplot(x='Rating', y='Positive Feedback Count', data=df)
# The leading '\n' puts a blank line before the figure caption.
print('\n figure 07')
python
1234figure 07 1

由图figure07可得出 评分在3以上的正面反馈的计数大
# 1. Text cleaning helper, stop-word set and word-cloud rendering.
import re

from wordcloud import WordCloud, STOPWORDS

# Matches every single character that is not an ASCII letter.
_NON_LETTERS = re.compile(r"[^a-zA-Z]")


def clean_data(text):
    """Return *text* reduced to lower-case ASCII words joined by single spaces.

    Every character that is not an ASCII letter (punctuation, digits, ...) is
    treated as a word separator. Non-string input, such as a missing value,
    yields an empty string instead of raising ``TypeError``.
    """
    if not isinstance(text, str):
        return ""
    letters_only = _NON_LETTERS.sub(" ", text)
    # split() with no arguments also collapses the runs of spaces created above.
    return " ".join(letters_only.lower().split())


# wordcloud's built-in stop words plus garment names that would otherwise
# dominate every cloud. Each word needs its own comma: adjacent string
# literals such as 'pants' 'jean' are concatenated into one entry, 'pantsjean'.
stopwords = set(STOPWORDS) | {
    'skirt', 'blouse', 'dress', 'sweater', 'shirt', 'bottom', 'pant',
    'pants', 'jean', 'jeans', 'jacket', 'top', 'dresse',
}


def create_cloud(rating):
    """Render and show a word cloud for an iterable of review strings.

    The strings in *rating* are joined with spaces and passed to ``WordCloud``
    (white background, 1600x800, at most 100 words, using the module-level
    ``stopwords`` set); the result is shown in a new figure without axes.
    """
    combined_text = ' '.join(rating)
    cloud = WordCloud(
        background_color='white',
        width=1600,
        height=800,
        max_words=100,
        stopwords=stopwords,
    ).generate(combined_text)
    plt.figure(figsize=(15, 7.5))
    plt.axis('off')
    plt.imshow(cloud)
    plt.show()
python
# Word cloud of the reviews with rating 5.
rating5 = df.loc[df['Rating'].eq(5), 'Review Text'].apply(clean_data)
create_cloud(rating5)
python
123
# Word cloud of the reviews with rating 4.
rating4 = df.loc[df['Rating'].eq(4), 'Review Text'].apply(clean_data)
create_cloud(rating4)
python
123
# Word cloud of the reviews with rating 3.
rating3 = df.loc[df['Rating'].eq(3), 'Review Text'].apply(clean_data)
create_cloud(rating3)
python
123
# Word cloud of the reviews with rating 2.
rating2 = df.loc[df['Rating'].eq(2), 'Review Text'].apply(clean_data)
create_cloud(rating2)
python
123
# Word cloud of the reviews with rating 1.
rating1 = df.loc[df['Rating'].eq(1), 'Review Text'].apply(clean_data)
create_cloud(rating1)
python
123
相关知识
2020年 花卉电商零售市场数据分析
16点聊电商:情人节淘宝数据:鲜花消费人群中60%为女性
电商平台消费行为数据分析.pdf
电商市场:花卉产品消费数据运营分析【多平台数据整合】
宠物市场新动态:电商数据可视化分析揭示细分需求增长
通过一个简单的电商零售数据集,了解数据分析流程
“订阅制+电商”模式,用数据重构个性化消费场景 – 人人都是产品经理,
电商技术揭秘十五:数据挖掘与用户行为分析
电商如何整合跨平台数据?这份电商数据化建设方案一学就会!
电商数据分析8——电商平台社交媒体营销的数据分析策略
网址: 女性服装数据分析(电商数据)版本1 https://m.huajiangbk.com/newsview2495497.html
| 上一篇: 批发、零售、直播、外卖、快递量身 |
下一篇: 电商场景:并发扣库存,怎么保证不 |