好久没更新博客,这回来分析点数据~
收集数据
最开始只是好奇有多少高手买了supporter(一个月$4/¥30,是不能让你变强的VIP,只带有一点小特权,跟捐赠差不多),反正既然要爬数据,不如顺带分析下其它东西。
话说回来,正确的操作方式应该是使用 osu! API v2,但是因为我这请求量不大,就直接从网页扒数据了。爬虫本身比较简单,没什么需要讲的,我就直接放出代码了,请勿滥用。
"""Scrape the osu! performance ranking pages and each listed player's profile into data.csv.

For every ranking page in ``start_page..end_page`` of every mode in
``mode_name`` the script reads the ranking table, then fetches each player's
profile page and extracts the JSON embedded in its ``data-initial-data``
attribute. One CSV row per player is appended after each ranking page, so a
crash keeps the pages already scraped.
"""
import csv
import json
import logging
import random
import time

import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%Y-%m-%dT%H:%M:%S',
)

mode_name = ["mania"]
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    )
}
start_page = 1
end_page = 20
fieldnames = [
    'uid', 'rank', 'username', 'country_code', 'country_name', 'accuracy',
    'play_count', 'main_keymode', 'performance', 'pp_4k', 'pp_7k',
    'SS_count', 'S_count', 'A_count', 'is_supporter', 'has_supported',
    'is_active', 'name_change_count', 'beatmap_playcount', 'comments_count',
    'friend_count', 'follower_count', 'favourite_count', 'graveyard_count',
    'ranked_count', 'loved_count', 'guest_count', 'level', 'ranked_score',
    'play_time', 'total_score', 'total_hits', 'maximum_combo',
    'replays_count',
]

OUTPUT_PATH = 'data.csv'
# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30
# Column order of the <td> cells that make up one player row of the ranking table.
RANKING_COLUMNS = (
    'rank', 'username', 'accuracy', 'play_count',
    'performance', 'SS_count', 'S_count', 'A_count',
)


def random_sleep():
    """Pause for a random 1-3 seconds between profile requests."""
    seconds = random.randint(1, 3)
    logging.info("Sleep for %s s", seconds)
    time.sleep(seconds)


def clean_text(text):
    """Return text with spaces, newlines and commas removed."""
    return text.replace(" ", "").replace("\n", "").replace(",", "")


def fetch_html(url, params=None):
    """GET url with the shared headers and return the response body.

    Raises:
        requests.exceptions.HTTPError: if the status code is not 200.
    """
    resp = requests.get(url=url, params=params, headers=headers,
                        timeout=REQUEST_TIMEOUT)
    if resp.status_code != 200:
        raise requests.exceptions.HTTPError(
            "The response code is not 200. Something's wrong!")
    return resp.text


def parse_ranking_rows(html):
    """Return one dict per player listed on a ranking page.

    Each dict holds 'uid' plus the RANKING_COLUMNS values as cleaned strings.
    """
    soup = BeautifulSoup(html, "lxml")
    user_links = soup.find_all(
        "a", class_="ranking-page-table__user-link-text js-usercard")
    cells = soup.find_all("td", class_="ranking-page-table__column")

    rows = []
    for idx, link in enumerate(user_links):
        base = len(RANKING_COLUMNS) * idx
        row = {'uid': link['data-user-id']}
        for offset, column in enumerate(RANKING_COLUMNS):
            row[column] = clean_text(cells[base + offset].text)
        row['rank'] = row['rank'].replace("#", "")
        rows.append(row)
    return rows


def parse_keymode_stats(stats):
    """Return pp_4k, pp_7k and main_keymode from a profile's statistics dict.

    All three are "N/A" when the per-keymode variants are missing or malformed.
    """
    try:
        pp_4k = int(stats["variants"][0]["pp"])
        pp_7k = int(stats["variants"][1]["pp"])
    except (KeyError, IndexError, TypeError, ValueError):
        return {'pp_4k': "N/A", 'pp_7k': "N/A", 'main_keymode': "N/A"}
    return {
        'pp_4k': pp_4k,
        'pp_7k': pp_7k,
        'main_keymode': "4k" if pp_4k > pp_7k else "7k",
    }


def parse_profile(html):
    """Extract the profile-derived CSV fields from a user profile page.

    Raises:
        ValueError: if the page does not contain the embedded profile JSON.
    """
    soup = BeautifulSoup(html, "lxml")
    node = soup.find(
        "div", class_="js-react--profile-page osu-layout osu-layout--full")
    if node is None:
        raise ValueError("Profile page has no embedded data-initial-data block")

    user = json.loads(node['data-initial-data'])["user"]
    stats = user["statistics"]
    profile = {
        'country_code': user["country"]["code"],
        'country_name': user["country"]["name"],
        'is_supporter': user["is_supporter"],
        'has_supported': user["has_supported"],
        'is_active': user["is_active"],
        'name_change_count': len(user["previous_usernames"]),
        'beatmap_playcount': user["beatmap_playcounts_count"],
        'comments_count': user["comments_count"],
        # The CSV names differ from the JSON keys on purpose: "friend_count"
        # stores the profile's follower_count and "follower_count" stores the
        # mapping_follower_count.
        'friend_count': user["follower_count"],
        'follower_count': user["mapping_follower_count"],
        'favourite_count': user["favourite_beatmapset_count"],
        'graveyard_count': user["graveyard_beatmapset_count"],
        'ranked_count': user["ranked_beatmapset_count"],
        'loved_count': user["loved_beatmapset_count"],
        'guest_count': user["guest_beatmapset_count"],
        'level': stats["level"]["current"],
        'ranked_score': stats["ranked_score"],
        'play_time': stats["play_time"],
        'total_score': stats["total_score"],
        'total_hits': stats["total_hits"],
        'maximum_combo': stats["maximum_combo"],
        'replays_count': stats["replays_watched_by_others"],
    }
    profile.update(parse_keymode_stats(stats))
    return profile


def append_rows(rows):
    """Append player rows to the output CSV."""
    with open(OUTPUT_PATH, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerows(rows)


def main():
    """Scrape every configured mode and page, writing the results to data.csv."""
    # Start a fresh file containing only the header row.
    with open(OUTPUT_PATH, 'w', newline='', encoding='utf-8') as csvfile:
        csv.DictWriter(csvfile, fieldnames=fieldnames).writeheader()

    for mode in mode_name:
        for pg in range(start_page, end_page + 1):
            logging.info("Getting page %s for mode %s", pg, mode)
            ranking_html = fetch_html(
                f"https://osu.ppy.sh/rankings/{mode}/performance",
                params={"page": str(pg)},
            )

            data_lst = []
            for player in parse_ranking_rows(ranking_html):
                # Request the profile for the mode being ranked, not a fixed one.
                profile_html = fetch_html(
                    f"https://osu.ppy.sh/users/{player['uid']}/{mode}")
                player.update(parse_profile(profile_html))
                data_lst.append(player)
                logging.info("#%s %s %s done!",
                             player['rank'], player['username'], player['uid'])
                random_sleep()

            # Flush once per ranking page so finished pages survive a crash.
            append_rows(data_lst)


if __name__ == "__main__":
    main()
分析数据
数据到手了,接下来用pandas和matplotlib来做一些可视化分析~
0x00 - 国家和地区
稍微看看高手们都来自哪里吧~
先导入一些必要的库
1 2 3 4 5 6 7 import randomimport matplotlibimport matplotlib.pyplot as pltimport matplotlib.colors as mcolorsimport pandas as pdimport numpy as npfrom cycler import cycler
接下来从data.csv
读取数据
# Load the scraped rows; the bare `df` on the last line displays the frame in a notebook.
df = pd.read_csv('data.csv')
df
1 2 3 4 5 6 7 8 # 篇幅所限,这里只展示第一行部分数据 uid rank username country_code country_name accuracy \ 0 758406 1 dressurf KR South Korea 97.92% play_count main_keymode performance pp_4k ... ranked_count \ 0 17483 7k 23142 7526 ... 0 [1000 rows x 34 columns]
然后简单构建一下dataframe
# Number of players per country code, most common first.
df_draw = (
    df.groupby('country_code')
    .size()
    .to_frame(name='count')
    .sort_values('count', ascending=False)
    .reset_index()
)
df_draw
1 2 3 4 5 6 7 8 # 篇幅所限,只展示部分数据 country_code count 0 KR 274 1 CN 109 ... 55 EC 1 56 IN 1
但是如果直接用df_draw
画饼图(pie chart)会有个问题。从上面数据得知,大部分国家地区的高手数量少于20人,这样会导致饼图被切割得很细,不仅难看,文字也会重叠在一起。所以这里需要进一步处理下数据,把少于20人的国家地区全部归类到其它(Other)
def shift_row_to_bottom(df, index_to_shift):
    """Return df with the row labelled index_to_shift moved to the end.

    The row is located by index label, so this also works when the index is
    not a default 0..n-1 range.

    Raises:
        ValueError: if index_to_shift is not a label of df.
    """
    labels = df.index.tolist()
    labels.remove(index_to_shift)
    return df.reindex(labels + [index_to_shift])


def my_autopct(pct):
    """Format a wedge percentage; wedges of 4% or less get no label."""
    return ('%1.1f%%' % pct) if pct > 4 else ''


# Fold every country with fewer than 20 players into a single 'Other' wedge
# so the pie is not sliced into unreadably thin pieces.
df_draw.loc[df_draw['count'] < 20, 'country_code'] = 'Other'
df_draw = df_draw.groupby('country_code')['count'].sum().reset_index()
df_draw = df_draw.sort_values('count', ascending=False, ignore_index=True)

# Put 'Other' last wherever the sort placed it (it is not always row 1).
other_labels = df_draw.index[df_draw['country_code'] == 'Other']
if len(other_labels) > 0:
    df_draw = shift_row_to_bottom(df_draw, other_labels[0])

# One distinct Set3 colour per wedge.
cm = plt.get_cmap('Set3')
matplotlib.rcParams["axes.prop_cycle"] = cycler(
    color=[cm(v) for v in np.linspace(0, 1, len(df_draw))]
)

plt.pie(df_draw['count'], labels=df_draw['country_code'],
        autopct=my_autopct, startangle=140)
plt.title("osu!mania top #1000 country code")
plt.show()
很明显,中日韩美加起来就占据了半壁江山,紧随其后的是菲律宾,印尼,英国,泰国,马来西亚。
0x01 - 4k vs 7k
众所周知,7k的pp上限比4k要高得多,但是感觉前1000名还是有相当多的4k巨佬,这就来看看到底有多少。
简单构建一下dataframe
# Number of players per main keymode, most common first.
df_draw = (
    df.groupby('main_keymode')
    .size()
    .to_frame(name='count')
    .sort_values('count', ascending=False)
    .reset_index()
)
print(df_draw)
1 2 3 main_keymode count 0 7k 631 1 4k 369
简单画个环图(donut chart)
# Donut chart of the main-keymode split.
df_draw = df_draw.groupby('main_keymode')['count'].sum().reset_index()
df_draw = df_draw.sort_values('count', ascending=False, ignore_index=True)

# One distinct Set3 colour per wedge.
cm = plt.get_cmap('Set3')
matplotlib.rcParams["axes.prop_cycle"] = cycler(
    color=[cm(v) for v in np.linspace(0, 1, len(df_draw))]
)

# plt.pie needs exactly one explode offset per wedge; size it from the data
# so an extra category such as "N/A" does not raise.
explode = [0.05] * len(df_draw)
plt.pie(df_draw['count'], labels=df_draw['main_keymode'], autopct='%1.1f%%',
        startangle=140, explode=explode, pctdistance=0.85)
plt.title("osu!mania top #1000 main keymode")

# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.show()
现在三位数门槛已经来到了10000pp,纯4k想刷10000pp的话要gamma+的实力,只能说佩服佩服
0x02 - osu!supporter
让我看看有多少铁公鸡👀
每个玩家有两个boolean值,分别是has_supported
和is_supporter
,简单groupby
一下就能算出“从未”、“曾经”、“现在”三种状态的人数
# Donut chart of supporter status: never / was a supporter / is a supporter.
df_draw = df.groupby(['has_supported', 'is_supporter']).size().to_frame(name='count')

# Label each group from its own (has_supported, is_supporter) key rather than
# by row position, so a missing or unexpected combination cannot shift labels.
status_by_flags = {
    (False, False): 'never',
    (True, False): 'was_supporter',
    (True, True): 'is_supporter',
    (False, True): 'is_supporter',
}
status = [
    status_by_flags[(bool(has_supported), bool(is_supporter))]
    for has_supported, is_supporter in df_draw.index
]
df_draw['status'] = status

# One distinct Set3 colour per wedge.
cm = plt.get_cmap('Set3')
matplotlib.rcParams["axes.prop_cycle"] = cycler(
    color=[cm(v) for v in np.linspace(0, 1, len(df_draw))]
)

# Pull the 'never' wedge out of the ring to emphasise it.
explode = [0.1 if label == 'never' else 0 for label in status]
plt.pie(df_draw['count'], labels=df_draw['status'], autopct='%1.1f%%',
        startangle=140, explode=explode, pctdistance=0.85)
plt.title("osu!mania top #1000 supporter")

# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.show()
这么看来,绝大部分高手都买过至少一次osu!supporter。我曾经也买过一年,但supporter确实没什么用,就没再续费了。
来看看铁公鸡头目👀
# Players whose has_supported flag is False, showing the rank..country_code columns.
never_supported = df['has_supported'] == False
df.loc[never_supported, 'rank':'country_code']
rank
username
country_code
4
Bssd
KR
7
karcice
KR
25
DellyK
KR
57
RaffCo
ID
64
God-
KR
0x03 - 改名次数
osu跟别的游戏不太一样,没法随意改id,修改次数越多就越贵,价格表如下所示。如果买了supporter,那么第一次改名是免费的。
Changes
Price
1
US$4
2
US$8
3
US$16
4
US$32
5
US$64
6+
US$100
那么来看看大家都改了几次id吧~
# Bar chart: number of players for each username-change count.
df_draw = df.groupby('name_change_count').size().reset_index(name='count')

fig, ax = plt.subplots()
bars = ax.bar(df_draw['name_change_count'], df_draw['count'])
ax.bar_label(bars)  # print the player count on top of each bar
ax.set_title("osu!mania top #1000 player name change")
ax.set_xlabel('# Name Change')
ax.set_ylabel('Player Count')
plt.show()
居然有人改了六次id,让我看看是哪个土豪。。。
# Players with exactly six username changes, showing the rank..country_code columns.
changed_six_times = df['name_change_count'] == 6
df.loc[changed_six_times, 'rank':'country_code']
rank
username
country_code
647
Lovelyn
FI
改id花两百多刀确实挺离谱的,但是std还有个叫anna apple 的玩家更离谱,豪掷$1300+改了17次名,什么活菩萨💸
这下ppy躺着数钱了,看看高手们给他贡献了多少钱
# List price in US$ of the 1st..5th username change; every later change costs
# the flat maximum.
NAME_CHANGE_PRICES = [4, 8, 16, 32, 64]
NAME_CHANGE_MAX_PRICE = 100


def cumulative_name_change_cost(changes):
    """Return the total list price in US$ of making `changes` username changes.

    This is an upper bound per player: it does not subtract the free first
    change that supporters get.
    """
    changes = max(int(changes), 0)
    tiered = sum(NAME_CHANGE_PRICES[:changes])
    beyond_table = max(changes - len(NAME_CHANGE_PRICES), 0)
    return tiered + beyond_table * NAME_CHANGE_MAX_PRICE


df_draw = df.groupby('name_change_count').size().to_frame(name='count').reset_index()
# Derive the cost from each row's own change count instead of a positional
# list, so the result stays right whichever counts occur in the data.
df_draw['cost'] = df_draw['name_change_count'].map(cumulative_name_change_cost)
df_draw['ppy_laugh'] = df_draw['cost'] * df_draw['count']
df_draw['ppy_laugh'].sum()
这只是mania前1000的玩家的收入。。。osu四模式这么多玩家,每天都有人会改名吧,跟包租婆差不多💦
0x04 - 键盘毁灭者
osu个人主页有个总命中次数(Total Hits),也就是键盘敲击次数
# Ten players with the most total hits.
shown_columns = ['rank', 'username', 'country_code', 'main_keymode', 'total_hits']
df_draw = df.sort_values('total_hits', ascending=False, ignore_index=True)
df_draw.loc[:, shown_columns].head(10)
rank
username
country_code
main_keymode
total_hits
576
Ery
US
4k
106949116
14
bojii
PH
7k
101279111
246
JDS20
CO
7k
100908913
348
masaya
NO
7k
98022770
102
[MYSTIC]
KR
7k
96114018
36
Arona
PH
7k
94566848
307
654564
KR
7k
94005977
475
X_Devil
RU
7k
89904830
34
SillyFangirl
BR
7k
86172553
351
Dale940
CA
7k
85310822
我自己只有3000万的击打次数,有没有可能是这些人特别肝呢?画个箱线图(boxplot)看看
def box_plot(data, edge_color, fill_color, ax=None):
    """Draw a horizontal, filled box plot of data and return the artists dict.

    Args:
        data: values to summarise.
        edge_color: colour for the box outline, whiskers, caps, median and fliers.
        fill_color: colour used to fill the box.
        ax: axes to draw on; defaults to the current axes, so the function no
            longer needs a global named ``ax``.

    Returns:
        The dict of artists returned by ``Axes.boxplot``.
    """
    if ax is None:
        ax = plt.gca()
    bp = ax.boxplot(data, patch_artist=True, vert=False, widths=0.4)

    for element in ['boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps']:
        plt.setp(bp[element], color=edge_color)

    for patch in bp['boxes']:
        patch.set(facecolor=fill_color)

    return bp


# Wide, short figures suit a single horizontal box. This changes the global
# default, so it also applies to figures created after this cell.
plt.rcParams["figure.figsize"] = (6, 2)

df_draw = df.sort_values('total_hits', ascending=False, ignore_index=True)
fig, ax = plt.subplots()
box_plot(df_draw['total_hits'], 'blue', 'cyan', ax=ax)

# Only one box is drawn, so the y axis carries no information.
plt.tick_params(left=False, labelleft=False)
plt.gca().xaxis.grid(True)
ax.set_title("osu!mania top #1000 total hits")
ax.set_xlabel('Total Hits (millions)')
# Show tick labels in units of 10^6 to match the axis label.
plt.ticklabel_format(style='sci', axis='x', scilimits=(6, 6))
plt.show()
很显然,刚才看到的前10都是肝帝中的肝帝,全部是outlier,实际上中位数只有2600万
国人居然没有上榜,那就看看国内的击打次数前十
# Ten players from China (country_code "CN") with the most total hits.
shown_columns = ['rank', 'username', 'country_code', 'main_keymode', 'total_hits']
df_draw = df.sort_values('total_hits', ascending=False, ignore_index=True)
is_cn = df_draw['country_code'] == "CN"
df_draw.loc[is_cn, shown_columns].head(10)
rank
username
country_code
main_keymode
total_hits
266
Mafuyu87Fanboy
CN
7k
77677383
122
StinkGod
CN
7k
68899290
734
Carpihat
CN
4k
63270291
208
ExNeko
CN
7k
60870441
193
MyAngelYukee7
CN
7k
52384916
454
MitoVan
CN
7k
51822079
222
ChenutBS
CN
7k
51703019
320
[GB]KingFish
CN
7k
47713907
958
[GB]AelSan
CN
4k
46657709
40
[Crz]Reimu
CN
7k
46443924
第一是龇牙雪糕,不是很意外,他bonus pp都刷到400了,而且他在私服还有1700万的击打次数。
实际上osu还有另一个指标,就是总游戏时长(Total Play Time),似乎是只计算打图的时间,反正ppy在一个六年前的帖子里是这样说的。
ppy: Counts total seconds actually spent playing a map (including breaks). Calculated retroactively, factoring in retries, exits and passes to roughly 95% accuracy.
def _days_label(seconds):
    """Render a duration in seconds as '<days rounded to 2 places> days'."""
    return str(round(seconds / 3600 / 24, 2)) + ' days'


# Ten players with the longest total play time, shown in days instead of seconds.
shown_columns = ['rank', 'username', 'country_code', 'main_keymode', 'play_time']
df_draw = df.sort_values('play_time', ascending=False, ignore_index=True)
df_draw = df_draw.loc[:, shown_columns].head(10)
df_draw['play_time'] = df_draw['play_time'].apply(_days_label)
df_draw
rank
username
country_code
main_keymode
play_time
576
Ery
US
4k
87.81 days
266
Mafuyu87Fanboy
CN
7k
77.3 days
348
masaya
NO
7k
77.06 days
927
DUELODER
AR
7k
75.86 days
246
JDS20
CO
7k
73.99 days
873
palmEuEi
TH
4k
72.96 days
297
hisaella
EE
7k
70.62 days
307
654564
KR
7k
70.4 days
475
X_Devil
RU
7k
69.61 days
139
lxLucasxl
AR
7k
69.42 days
画图代码和上面的几乎一样,就是要记得先预处理下play_time
,因为爬到的数据都是秒,转换成天数会更直观。
def _seconds_to_days(seconds):
    """Convert seconds to days, rounded to two decimal places."""
    return round(seconds / 3600 / 24, 2)


# Convert play_time from seconds to days before plotting.
df_draw['play_time'] = df_draw['play_time'].apply(_seconds_to_days)
可以看到跟击打次数一样,前排玩家都是肝帝,中位数只有20天
国榜的代码跟前面的差不多,就不写了
rank
username
country_code
main_keymode
play_time
266
Mafuyu87Fanboy
CN
7k
77.3 days
734
Carpihat
CN
4k
55.05 days
122
StinkGod
CN
7k
53.15 days
208
ExNeko
CN
7k
49.5 days
454
MitoVan
CN
7k
44.84 days
958
[GB]AelSan
CN
4k
40.82 days
193
MyAngelYukee7
CN
7k
39.88 days
222
ChenutBS
CN
7k
33.88 days
320
[GB]KingFish
CN
7k
33.77 days
770
tito31158
CN
7k
33.35 days
注意到一个挺有意思的点,两个boxplot的lower fence都接近0,估计是从其它游戏过来的大佬,让我看看是谁
def _days_label(seconds):
    """Render a duration in seconds as '<days rounded to 2 places> days'."""
    return str(round(seconds / 3600 / 24, 2)) + ' days'


# Ten players with the fewest total hits, with play time shown in days.
shown_columns = ['rank', 'username', 'country_code', 'main_keymode',
                 'play_count', 'total_hits', 'play_time']
df_draw = df.sort_values('total_hits', ascending=True, ignore_index=True)
df_draw = df_draw.loc[:, shown_columns].head(10)
df_draw['play_time'] = df_draw['play_time'].apply(_days_label)
df_draw
rank
username
country_code
main_keymode
play_count
total_hits
play_time
992
CDHBS
KR
7k
230
375558
0.29 days
697
020101
JP
7k
225
384465
0.33 days
592
2222
SE
7k
527
712666
0.58 days
572
Sujin97
KR
7k
450
972776
0.6 days
602
ckguswjd9
KR
7k
832
1360380
0.98 days
101
Lalalalak
KR
7k
744
1373734
0.77 days
453
muchinuser
KR
7k
1126
1505659
0.94 days
715
heavyweapon9
PH
7k
704
1603080
1.07 days
87
TOXICBYTE
ID
7k
794
1648476
1.17 days
972
NEON573
KR
7k
1045
1753878
1.32 days
看了下uid,有一半都是一年之内注册的号,而且无一例外全都是7k玩家。这样刷pp不会被系统ban吗?问得好,我也不知道,根据以往经验,刚转来osu!mania玩的时候不能刷的太猛。。。
还有一个play_count
,我个人认为这个数据注水成分比较大,就不看了。
0x05 - 多面手
突发奇想,看看谁是4k最强的7k玩家
# Ten 7k-main players with the highest 4k pp.
df_draw = df.copy()
is_7k_main = df_draw['main_keymode'] == '7k'
df_draw = df_draw.loc[is_7k_main].sort_values('pp_4k', ascending=False, ignore_index=True)
df_draw.loc[:, ['rank', 'username', 'country_code', 'pp_4k', 'pp_7k']].head(10)
rank
username
country_code
pp_4k
pp_7k
14
bojii
PH
14018
19121
34
SillyFangirl
BR
13871
17703
8
Kalkai
KR
13427
20326
33
yeonho7028
KR
13387
17740
92
CappK
NZ
13050
14378
116
grillroasted
CZ
12624
13919
120
Papago
KR
12164
13825
83
SnowScent
KR
11956
14896
118
DawnX
CN
11913
14188
46
blueBloody
KR
11842
16689
bojii第一,不出意外,目前4k和7k都登峰造极的怪物。但是感觉眼熟的大佬不是很多,可能大部分都比较低调吧
反过来看看谁是7k最强的4k玩家,代码一样就不贴了
rank
username
country_code
pp_4k
pp_7k
115
instal
TH
14419
11740
179
Yeoul
KR
12632
11719
235
[Crz]Ha0201
TW
12012
11562
355
[Crz]Yokniz
CN
11221
11097
256
gaesol
KR
12339
11067
267
xxxxxx2800
MY
12063
10983
275
[Crz]sunnyxxy
CN
11430
10881
408
Alicemana_v
CN
10851
10414
489
pimoux
FR
10950
9999
421
KalkaiFanboy
KR
11290
9900
instal是最近一年多崛起的泰国糊王,目前的4k pp第一,领先第二的Myuka 100pp。看了下他YouTube频道,Gamma +DT (od0) 94.36% ,Epsilon x1.2 (od0) 95.03% ,太猛了!!给不玩4k的解释一下,Epsilon原速目前国内还没有能96%的,他这个x1.2倍速虽然是od0,95%也是非常无敌的成绩了。
0x06 - 谱面和关注数量
osu在去年这个时候推出了一个新的功能,类似于subscribe,只要谱师上传新图,就会收到通知。那么,上传的谱面越多,关注我的人就会跟着变多吗?
# Scatter plot of uploaded beatmaps against mapping subscribers, with a
# least-squares straight line, followed by their correlation matrix.
df_draw = df.copy()
df_draw['total_maps'] = (
    df_draw['graveyard_count'] + df_draw['ranked_count'] + df_draw['loved_count']
)
df_draw = df_draw.loc[:, ['follower_count', 'total_maps']]

x = df_draw['total_maps']
y = df_draw['follower_count']

# Degree-1 polynomial fit, evaluated at each distinct x value for the red line.
fitted_line = np.poly1d(np.polyfit(x, y, 1))
distinct_x = np.unique(x)

fig, ax = plt.subplots()
plt.scatter(x, y)
plt.plot(distinct_x, fitted_line(distinct_x), color='r')
plt.grid(True)
ax.set_title("osu!mania top #1000 beatmaps - subscriber")
ax.set_xlabel('Uploaded Beatmaps')
ax.set_ylabel('Subscriber Count')
ax.set_axisbelow(True)  # draw the grid behind the points
plt.show()

df_draw.corr()
# Output:
#                 follower_count  total_maps
# follower_count        1.000000    0.419528
# total_maps            0.419528    1.000000
乍一看相关性似乎不是很强,简单线性回归一下发现R-squared
只有0.176,相当的低,说明拟合度很差。其实从图表也可以看出来,明显是有少数人气王和肝帝在干扰数据,把outlier去除之后再跑一次看看。这里使用了 follower_count < 300 和 total_maps < 200 来过滤掉outlier。
# Keep only players with fewer than 300 followers and fewer than 200 maps.
within_limits = (df_draw['follower_count'] < 300) & (df_draw['total_maps'] < 200)
df_draw = df_draw.loc[within_limits, ['follower_count', 'total_maps']]

# Correlation matrix shown in the post after filtering:
#                 follower_count  total_maps
# follower_count        1.000000    0.696841
# total_maps            0.696841    1.000000
有种拨云见日的感觉,再简单跑下回归
import statsmodels.formula.api as sm

# Ordinary least squares regression of follower_count on total_maps.
model = sm.ols(formula="follower_count ~ total_maps", data=df_draw)
result = model.fit()
print(result.summary())
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 OLS Regression Results ============================================================================== Dep. Variable: follower_count R-squared: 0.486 Model: OLS Adj. R-squared: 0.485 Method: Least Squares F-statistic: 931.7 Date: Mon, 03 Oct 2022 Prob (F-statistic): 1.24e-144 Time: 17:08:17 Log-Likelihood: -4316.2 No. Observations: 989 AIC: 8636. Df Residuals: 987 BIC: 8646. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.7596 0.676 2.601 0.009 0.432 3.087 total_maps 0.7870 0.026 30.524 0.000 0.736 0.838 ============================================================================== Omnibus: 826.723 Durbin-Watson: 1.958 Prob(Omnibus): 0.000 Jarque-Bera (JB): 24320.523 Skew: 3.687 Prob(JB): 0.00 Kurtosis: 26.147 Cond. No. 29.3 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
简单解读一下:
预测模型:follower_count = 1.7596 + total_maps * 0.7870
R-squared = 0.486,表示模型能解释 follower_count 约 48.6% 的方差(可以粗略理解成拟合优度,而不是预测准确率)
p-value < 0.05,代表有统计显著性(statistical significance)
也就是说,只要你不是上传500张谱的肝帝(qodtjr,Flexo123),或者自带海量人气(Jakads,Andere),那确实是上传的谱面越多,关注订阅数量也越多,天道酬勤。这个Andere我没听说过,但是稍微查了下,他在YouTube有接近10万订阅,也就不难解释这离谱的follower数量了。
最后稍微预测一下
# Predict the follower count for a few example map counts with the fitted model.
data = [10, 20, 30, 40, 50, 60]
df_test = pd.DataFrame(data, columns=['total_maps'])
# The frame already has a default index; the former `df_test.reset_index()`
# call discarded its result and had no effect, so it is dropped.
df_test['follower_count'] = result.predict(df_test)
df_test
# Output:
#    total_maps  follower_count
# 0          10        9.629600
# 1          20       17.499600
# 2          30       25.369600
# 3          40       33.239600
# 4          50       41.109600
# 5          60       48.979601
0x07 - 回放次数
# Ten players whose replays have been watched the most.
shown_columns = ['rank', 'username', 'country_code', 'main_keymode', 'replays_count']
df_draw = df.sort_values('replays_count', ascending=False, ignore_index=True)
df_draw.loc[:, shown_columns].head(10)
rank
username
country_code
main_keymode
replays_count
2
Jakads
KR
7k
1023293
106
inteliser
JP
7k
322858
72
OutLast
KR
7k
299622
444
Majesty
KR
7k
272202
128
[GS]Myuka
CL
4k
220485
547
gosy777
KR
7k
197294
470
Gon
MY
7k
163459
247
Lothus
BR
7k
158777
14
bojii
PH
7k
155693
82
Estonians
KR
7k
151102
没啥好说的,都是佬中佬,不过我更关心大部分高手都有多少回放观看次数。
# Box plot of replay view counts on a natural-log scale, then the raw median.
df_draw = df.sort_values('replays_count', ascending=False, ignore_index=True)
replays = df_draw['replays_count']

# ln(0) is -inf, which cannot be drawn, so only positive counts are plotted.
# The raw column is left untouched so the median below is an actual replay
# count rather than a logarithm.
log_replays = np.log(replays[replays > 0].to_numpy())

fig, ax = plt.subplots()
box_plot(log_replays, 'blue', 'cyan')
plt.tick_params(left=False, labelleft=False)
plt.gca().xaxis.grid(True)
ax.set_title("osu!mania top #1000 replay count")
ax.set_xlabel('Replay Count (e^x)')
plt.show()

replays.median()
这里用ln处理了下,要不然这图全都挤到左边,完全没法看。中位数是141.5,看来是我太菜了,完全没人看回放嘛。。。
0x08 - 好友数量
osu的好友是单向关注的(绿色),如果互关了(mutual)就会变成粉色。这里统计的是个人主页显示的好友数量,也就是有多少人关注了你。
# Ten players with the highest friend_count.
shown_columns = ['rank', 'username', 'country_code', 'main_keymode', 'friend_count']
df_draw = df.sort_values('friend_count', ascending=False, ignore_index=True)
df_draw.loc[:, shown_columns].head(10)
rank
username
country_code
main_keymode
friend_count
2
Jakads
KR
7k
15317
34
SillyFangirl
BR
7k
8316
128
[GS]Myuka
CL
4k
3619
748
Andere
CL
7k
3458
14
bojii
PH
7k
2744
106
inteliser
JP
7k
2595
131
arcwinolivirus
PH
7k
2234
176
Motion
KR
7k
2164
1
dressurf
KR
7k
1896
204
sunix
MX
7k
1871
# Box plot of friend counts on a natural-log scale, then the raw median.
df_draw = df.sort_values('friend_count', ascending=False, ignore_index=True)
friends = df_draw['friend_count']

# ln(0) is -inf, which cannot be drawn, so only positive counts are plotted.
# The raw column is left untouched so the median below is an actual friend
# count rather than a logarithm.
log_friends = np.log(friends[friends > 0].to_numpy())

fig, ax = plt.subplots()
box_plot(log_friends, 'blue', 'cyan')
plt.tick_params(left=False, labelleft=False)
plt.gca().xaxis.grid(True)
ax.set_title("osu!mania top #1000 friend count")
ax.set_xlabel('Friend Count (e^x)')
plt.show()

friends.median()
因为Jakads的存在,这里同样用ln处理了下。中位数是117.5,看来我不仅菜,还没什么好友。。。
注意到有outlier处于0的位置上,让我看看哪些低调大佬打进三位数还没好友的👀
# Players with a friend_count of zero, ordered by rank.
df_draw = df.sort_values(by=['friend_count', 'rank'], ascending=True, ignore_index=True)
has_no_friends = df_draw['friend_count'] == 0
df_draw.loc[has_no_friends, ['rank', 'username', 'country_code', 'friend_count']]
rank
username
country_code
play_count
697
020101
JP
225
731
Okusann Tifa
CN
9987
825
imrrr
KR
5209
看了下,无一例外都是7k人,可能都是从别的游戏转过来的吧。如果不发视频,不加群,也不联机,确实不容易被关注到。
总结
这次复习了爬虫,基础的统计学知识,还有pandas和matplotlib,许久不用确实有些生疏了。就先这样吧,之后想到什么有意思的内容会补充。