updated -> 2024-01-02
What follows is the underlying code (the scraping part is not included).
- clean up the student list (from bluearchive.gg)
In [1]:
import json
import os
import pandas as pd
from IPython.display import display
from tqdm.notebook import tqdm

with open('./data/etc/ba_char.txt', 'r+', encoding='utf-8') as f:
    char_list = [char.strip() for char in f.readlines()]
char_list = list(set([char.strip().split(' ')[0].lower() for char in char_list]))
char_list += ['aris', 'arisu']
print(len(char_list))
char_list
165
Out[1]:
['kayoko', 'sensei', 'master', 'unknown', 'arona', 'hare', 'nonomi', 'shokuhou', 'beatrice', 'iori', 'megu', 'meru', 'alice', 'kuzunoha', 'hifumi', 'yukari', 'tsurugi', 'nyanten-maru', 'phrenapates', 'saya', 'rumi', 'otogi', 'kaede', 'ibuki', 'iroha', 'eimi', 'karin', 'satsuki', 'akari', 'kai', 'sumomo', 'shiba', 'yuzu', 'makoto', 'wakamo', 'ayumu', 'haruna', 'saki', 'ruiko', 'rin', 'izumi', 'himari', 'niko', 'midori', 'ayane', 'aoi', 'gsc', 'erika', 'koharu', 'fubuki', 'kirino', 'takane', 'kisaki', 'akane', 'minori', 'hanae', 'maki', 'shinon', 'marina', 'decalcomanie', 'saten', 'reijo', 'sena', 'koyuki', 'reisa', 'renge', 'miku', 'mina', 'kaya', 'azusa', 'haruka', 'misaki', 'golconde', 'hibiki', 'mashiro', 'suzumi', 'shiroko', 'plana', 'maestro', 'kotama', 'mikoto', 'tomoe', 'sora', 'serina', 'umika', 'nagisa', 'sakurako', 'ako', 'momoka', 'niya', 'izuna', 'kurumi', 'junko', 'hiyori', 'shimiko', 'mimori', 'nodoka', 'atsuko', 'aru', 'kirara', 'hina', 'michiru', 'chihiro', 'shizuko', 'chinatsu', 'akira', 'mari', 'asuna', 'haine', 'momoi', 'kasumi', 'shigure', 'tsubaki', 'kikyou', 'nagusa', 'mai', 'yuuka', 'utaha', 'natsu', 'francis', 'airi', 'miyako', 'noa', 'chise', 'tsukuyo', 'hasumi', 'ui', 'hoshino', 'juri', 'ichika', 'cherino', 'serika', 'owner', 'moe', 'rabu', 'sumire', 'rio', 'hanako', 'kazusa', 'hinata', 'mutsuki', 'yukino', 'descartes', 'kanna', 'kokona', 'suzume', 'kotori', 'shun', 'miyu', 'saori', 'fuuka', 'seia', 'yakumo', 'pina', 'mika', 'toki', 'yoshimi', 'mine', 'misaka', 'black', 'kaho', 'neru', 'momiji', 'aris', 'arisu']
In [111]:
file_list = [f for f in os.listdir('./data') if f.endswith('.json')]
file_list.sort()
print(file_list)
data_l = list()
for file in tqdm(file_list):
    data = pd.read_json(f'./data/{file}', orient='index')[['author', 'created_utc', 'title', 'over_18', 'link_flair_text', 'comments']]
    data_l.append(data)
data = pd.concat(data_l)
# display(data)
['01.json', '02.json', '03.json', '04.json', '05.json', '06.json', '07.json', '08.json', '09.json', '10.json', '11.json', '12.json']
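For reference, read_json with orient='index' expects each monthly dump to be a JSON object keyed by submission id. A hypothetical sample of the assumed shape (field subset only, values taken from the first row displayed below):

sample = {
    "106eh24": {
        "author": "supreme_freshlord",
        "created_utc": 1673167999,
        "title": "Will Blue Archive run out of names?",
        "over_18": False,
        "link_flair_text": "Discussion",
        "comments": [{"body": "A quick search on google ..."}],
    },
}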
In [113]:
data.comments = data.comments.map(lambda inp: [comment['body'] for comment in inp])
display(data)
 | author | created_utc | title | over_18 | link_flair_text | comments
---|---|---|---|---|---|---
106eh24 | supreme_freshlord | 1673167999 | Will Blue Archive run out of names? | False | Discussion | [A quick search on google for a Japanese nameb… |
106e6ye | Sterbezz | 1673166949 | Eh? | False | Post Removed – (RL-12) Posting Quality Assurance | [Thank you for your submission! Unfortunately,… |
106dgnk | ampzzzz | 1673164358 | Bunny Karin & Asuna – by LEMONECO | True | NON OC ART | [\nYour submission was automatically removed f… |
106clfu | OriginalDCD | 1673161276 | [Double Drop Campaign Preview] The amount of r… | False | EN/KR/TW/TH – News | [I ain’t complaining I could really use some b… |
106c0z1 | AnksDilxMC | 1673159357 | Bocchi the Rock ending 3 but it’s the Game Dev… | False | NON OC ART | [###[View link](https://redditsave.com/info?ur… |
… | … | … | … | … | … | … |
18ph464 | Accomplished-Joke554 | 1703371646 | The Saiba twins be gaming after bed time (Mido… | False | NON OC ART | [Five Nights at Seminar would be more apt, if … |
18pgvwv | NegressorSapiens | 1703370944 | Hoshino, Hina, Arisu/Alice & Miyako Sharin… | False | NON OC ART | [what a weird combinations? ah, this must be o… |
18pgn3e | mtparanal | 1703370193 | Kanna (1st year) [by Aldi Fauzan] | False | NON OC ART | [Man, Kanna is so cool., Sauce: [pixiv](https:… |
18pgk5d | Go_Fcks_Yrslf_1514 | 1703369956 | You’re one the female student of kivotos who f… | False | Discussion | [Your submission was automatically removed.\n\… |
18pfar3 | NegressorSapiens | 1703366253 | Hoshino, Hina, Arisu/Alice & Miyako Sharin… | False | NON OC ART | [*Don’t worry about Ojisan here; she’s just do… |
35032 rows × 6 columns
In [114]:
count = 0
for comments in data.comments:
    count += len(comments)
count
Out[114]:
534007
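An equivalent one-liner, since every cell of comments is already a list:

count = data.comments.map(len).sum()  # 534007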
getting the flair types
In [115]:
import matplotlib.pyplot as plt

def clean_flair(flair_data, cut=500):
    print('\nbefore:', len(flair_data))
    # combine all REMOVED flairs
    removed = 0
    removed_idx = list()
    for i in range(len(flair_data)):
        if 'removed' in flair_data.index[i].lower():
            removed += flair_data.iloc[i]
            removed_idx.append(flair_data.index[i])
    # combine all flairs smaller than <cut> into ETC
    flair_data_clean = flair_data.drop(removed_idx)[flair_data >= cut]
    etc = flair_data.drop(removed_idx)[flair_data < cut].sum()
    flair_data_clean = pd.concat([flair_data_clean, pd.Series([removed], index=['Removed'])])
    flair_data_clean = pd.concat([flair_data_clean, pd.Series([etc], index=['ETC'])])
    print('after:', len(flair_data_clean))
    print('removed:', removed)
    print()
    return flair_data_clean
In [116]:
flair_data = data.link_flair_text.value_counts()
flair_data_clean = clean_flair(flair_data)
print(flair_data_clean)

y = flair_data_clean
mylabels = flair_data_clean.index

def make_autopct(values):
    # show each wedge as "percent (absolute count)"
    def my_autopct(pct):
        total = sum(values)
        val = int(round(pct*total/100.0))
        return '{p:.2f}% ({v:d})'.format(p=pct, v=val)
    return my_autopct

plt.pie(y, labels=mylabels, startangle=90, autopct=make_autopct(y), rotatelabels=True, pctdistance=.8)
plt.show()
before: 137
after: 8
removed: 1808

NON OC ART              13307
OC ART                   4250
Comic/TL                 4158
BA Meme / Video meme     3002
Discussion               2646
General                  2076
Removed                  1808
ETC                      3539
dtype: int64
getting the over_18 percentage
In [117]:
flair_data = data.over_18.value_counts()
print(data.over_18.value_counts())
over_18
False    28684
True      6348
Name: count, dtype: int64
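Since the heading asks for a percentage: the mean of a boolean column is the share of True values, so one line gives it:

print(f'{data.over_18.mean() * 100:.2f}% of submissions are over_18')  # 6348 / 35032 ≈ 18.12%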
In [118]:
# Calculate percentage for each flair
flair_counts = data['link_flair_text'].value_counts(normalize=True) * 100
# Calculate proportion of over_18 for each flair
over_18_proportion = data.groupby('link_flair_text')['over_18'].mean() * 100
# Calculate proportion of over_18 for total
over_18_proportion_total = over_18_proportion.multiply(flair_counts) / 100
# Merging the results
result = pd.concat([flair_counts, over_18_proportion, over_18_proportion_total], axis=1)
result.columns = ['Flair Percentage', 'Over 18 Proportion', 'Over 18 proportion for total']
display(result.head(10))
result.to_csv('./data/etc/flairs.csv')
link_flair_text | Flair Percentage | Over 18 Proportion | Over 18 proportion for total
---|---|---|---
NON OC ART | 38.253895 | 31.810325 | 12.168689 |
OC ART | 12.217559 | 23.835294 | 2.912091 |
Comic/TL | 11.953085 | 8.489658 | 1.014776 |
BA Meme / Video meme | 8.629909 | 2.964690 | 0.255850 |
Discussion | 7.606508 | 0.982615 | 0.074743 |
General | 5.967918 | 2.167630 | 0.129362 |
Megathread | 1.422986 | 0.000000 | 0.000000 |
Fan Fiction | 1.420112 | 2.429150 | 0.034497 |
EN/KR/TW/TH – News | 1.351118 | 0.212766 | 0.002875
JP – News | 1.244754 | 2.309469 | 0.028747
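Sanity check: each entry of the last column is the flair's share of posts times its within-flair over_18 rate, so (assuming every post has a flair) the column should sum back to the overall over_18 share of roughly 18.12%:

# e.g. NON OC ART: 38.25% of posts, 31.81% of them over_18 -> 38.25 * 31.81 / 100 ≈ 12.17 points
print(result['Over 18 proportion for total'].sum())  # ≈ 18.1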
Artwork categorization for submissions
In [119]:
from nltk.tokenize import TweetTokenizer  # seems suitable for Reddit, and handy for Twitter artist sourcing
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('punkt')

tk = word_tokenize
ttk = TweetTokenizer()
wnl = WordNetLemmatizer()

tks = [[wnl.lemmatize(word).lower() for word in ttk.tokenize(title)] for title in tqdm(data.title)]
# alternative: keep the submission id alongside each token list
# tks = [(idx, [wnl.lemmatize(word).lower() for word in ttk.tokenize(data.iloc[i].title)]) for i, idx in tqdm(enumerate(data.index))]
print(tks[:5])
[nltk_data] Downloading package wordnet to /home/maxjo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/maxjo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[['will', 'blue', 'archive', 'run', 'out', 'of', 'name', '?'], ['eh', '?'], ['bunny', 'karin', '&', 'asuna', '-', 'by', 'lemoneco'], ['[', 'double', 'drop', 'campaign', 'preview', ']', 'the', 'amount', 'of', 'reward', 'dropped', 'in', 'missions', 'will', 'be', 'doubled', '.'], ['bocchi', 'the', 'rock', 'ending', '3', 'but', "it's", 'the', 'game', 'development', 'department', '(', 'by', '@dkwlro', 'on', 'twitter', ')']]
In [120]:
# TweetTokenizer keeps contractions whole (e.g. "it's" stays a single token),
# so the trailing 's is stripped manually here
tks = [[word[:-2] if word.endswith("'s") else word for word in title] for title in tks]
print('\n', tks[:5])
[['will', 'blue', 'archive', 'run', 'out', 'of', 'name', '?'], ['eh', '?'], ['bunny', 'karin', '&', 'asuna', '-', 'by', 'lemoneco'], ['[', 'double', 'drop', 'campaign', 'preview', ']', 'the', 'amount', 'of', 'reward', 'dropped', 'in', 'missions', 'will', 'be', 'doubled', '.'], ['bocchi', 'the', 'rock', 'ending', '3', 'but', 'it', 'the', 'game', 'development', 'department', '(', 'by', '@dkwlro', 'on', 'twitter', ')']]
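For contrast, a minimal check of why TweetTokenizer was paired with the manual strip: word_tokenize splits the clitic on its own but mangles handles, while TweetTokenizer keeps handles intact:

from nltk.tokenize import TweetTokenizer, word_tokenize

s = "it's by @dkwlro on twitter"
print(word_tokenize(s))              # ['it', "'s", 'by', '@', 'dkwlro', ...] -- splits the handle
print(TweetTokenizer().tokenize(s))  # ["it's", 'by', '@dkwlro', ...] -- keeps the handle whole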
In [121]:
data['tks'] = tks
display(data)
 | author | created_utc | title | over_18 | link_flair_text | comments | tks
---|---|---|---|---|---|---|---
106eh24 | supreme_freshlord | 1673167999 | Will Blue Archive run out of names? | False | Discussion | [A quick search on google for a Japanese nameb… | [will, blue, archive, run, out, of, name, ?] |
106e6ye | Sterbezz | 1673166949 | Eh? | False | Post Removed – (RL-12) Posting Quality Assurance | [Thank you for your submission! Unfortunately,… | [eh, ?]
106dgnk | ampzzzz | 1673164358 | Bunny Karin & Asuna – by LEMONECO | True | NON OC ART | [\nYour submission was automatically removed f… | [bunny, karin, &, asuna, -, by, lemoneco]
106clfu | OriginalDCD | 1673161276 | [Double Drop Campaign Preview] The amount of r… | False | EN/KR/TW/TH – News | [I ain’t complaining I could really use some b… | [[, double, drop, campaign, preview, ], the, a…
106c0z1 | AnksDilxMC | 1673159357 | Bocchi the Rock ending 3 but it’s the Game Dev… | False | NON OC ART | [###[View link](https://redditsave.com/info?ur… | [bocchi, the, rock, ending, 3, but, it, the, g… |
… | … | … | … | … | … | … | … |
18ph464 | Accomplished-Joke554 | 1703371646 | The Saiba twins be gaming after bed time (Mido… | False | NON OC ART | [Five Nights at Seminar would be more apt, if … | [the, saiba, twin, be, gaming, after, bed, tim… |
18pgvwv | NegressorSapiens | 1703370944 | Hoshino, Hina, Arisu/Alice & Miyako Sharin… | False | NON OC ART | [what a weird combinations? ah, this must be o… | [hoshino, ,, hina, ,, arisu, /, alice, &, miya… |
18pgn3e | mtparanal | 1703370193 | Kanna (1st year) [by Aldi Fauzan] | False | NON OC ART | [Man, Kanna is so cool., Sauce: [pixiv](https:… | [kanna, (, 1st, year, ), [, by, aldi, fauzan, ]] |
18pgk5d | Go_Fcks_Yrslf_1514 | 1703369956 | You’re one the female student of kivotos who f… | False | Discussion | [Your submission was automatically removed.\n\… | [you’re, one, the, female, student, of, kivoto… |
18pfar3 | NegressorSapiens | 1703366253 | Hoshino, Hina, Arisu/Alice & Miyako Sharin… | False | NON OC ART | [*Don’t worry about Ojisan here; she’s just do… | [hoshino, ,, hina, ,, arisu, /, alice, &, miya… |
35032 rows × 7 columns
In [122]:
def is_there_student(tks):
    return list(set([tk for tk in tks if tk in char_list]))

def find_alias(inp, aliases: list, to: str):
    return list(set([to if i in aliases else i for i in inp]))

data['students'] = data['tks'].map(is_there_student)
# fold the aris/alice/arisu aliases into a single name
data['students'] = data['students'].map(lambda inp: find_alias(inp, ['aris', 'alice', 'arisu'], 'aris'))
display(data)
 | author | created_utc | title | over_18 | link_flair_text | comments | tks | students
---|---|---|---|---|---|---|---|---
106eh24 | supreme_freshlord | 1673167999 | Will Blue Archive run out of names? | False | Discussion | [A quick search on google for a Japanese nameb… | [will, blue, archive, run, out, of, name, ?] | [] |
106e6ye | Sterbezz | 1673166949 | Eh? | False | Post Removed – (RL-12) Posting Quality Assurance | [Thank you for your submission! Unfortunately,… | [eh, ?] | []
106dgnk | ampzzzz | 1673164358 | Bunny Karin & Asuna – by LEMONECO | True | NON OC ART | [\nYour submission was automatically removed f… | [bunny, karin, &, asuna, -, by, lemoneco] | [asuna, karin]
106clfu | OriginalDCD | 1673161276 | [Double Drop Campaign Preview] The amount of r… | False | EN/KR/TW/TH – News | [I ain’t complaining I could really use some b… | [[, double, drop, campaign, preview, ], the, a… | []
106c0z1 | AnksDilxMC | 1673159357 | Bocchi the Rock ending 3 but it’s the Game Dev… | False | NON OC ART | [###[View link](https://redditsave.com/info?ur… | [bocchi, the, rock, ending, 3, but, it, the, g… | [] |
… | … | … | … | … | … | … | … | … |
18ph464 | Accomplished-Joke554 | 1703371646 | The Saiba twins be gaming after bed time (Mido… | False | NON OC ART | [Five Nights at Seminar would be more apt, if … | [the, saiba, twin, be, gaming, after, bed, tim… | [midori, momoi] |
18pgvwv | NegressorSapiens | 1703370944 | Hoshino, Hina, Arisu/Alice & Miyako Sharin… | False | NON OC ART | [what a weird combinations? ah, this must be o… | [hoshino, ,, hina, ,, arisu, /, alice, &, miya… | [aris, hina, miyako, hoshino] |
18pgn3e | mtparanal | 1703370193 | Kanna (1st year) [by Aldi Fauzan] | False | NON OC ART | [Man, Kanna is so cool., Sauce: [pixiv](https:… | [kanna, (, 1st, year, ), [, by, aldi, fauzan, ]] | [kanna] |
18pgk5d | Go_Fcks_Yrslf_1514 | 1703369956 | You’re one the female student of kivotos who f… | False | Discussion | [Your submission was automatically removed.\n\… | [you’re, one, the, female, student, of, kivoto… | [] |
18pfar3 | NegressorSapiens | 1703366253 | Hoshino, Hina, Arisu/Alice & Miyako Sharin… | False | NON OC ART | [*Don’t worry about Ojisan here; she’s just do… | [hoshino, ,, hina, ,, arisu, /, alice, &, miya… | [aris, hina, miyako, hoshino] |
35032 rows × 8 columns
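A quick sanity check of the two helpers on a hand-made token list (illustrative only; output order varies because of the set round-trip):

print(is_there_student(['bunny', 'alice', '&', 'hina']))                   # ['alice', 'hina'] in some order
print(find_alias(['alice', 'hina'], ['aris', 'alice', 'arisu'], 'aris'))   # ['aris', 'hina'] in some order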
'NON OC ART ' and 'OC ART ' somehow have a trailing space!
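A one-line normalization would make the padding moot; it is not applied here, so the isin() lists below keep the trailing spaces (sketch only):

# strip stray whitespace from every flair before filtering
flair_stripped = data['link_flair_text'].str.strip()
art_mask = flair_stripped.isin(['NON OC ART', 'OC ART', 'Comic/TL'])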
In [123]:
artworks = data[ data['link_flair_text'].isin(['NON OC ART ', 'OC ART ', 'Comic/TL']) ]
display(artworks)
artworks_student = pd.Series( [student for students in artworks.students for student in students] ).value_counts()
artworks_student.nlargest(25)
 | author | created_utc | title | over_18 | link_flair_text | comments | tks | students
---|---|---|---|---|---|---|---|---
106dgnk | ampzzzz | 1673164358 | Bunny Karin & Asuna – by LEMONECO | True | NON OC ART | [\nYour submission was automatically removed f… | [bunny, karin, &, asuna, -, by, lemoneco] | [asuna, karin] |
106c0z1 | AnksDilxMC | 1673159357 | Bocchi the Rock ending 3 but it’s the Game Dev… | False | NON OC ART | [###[View link](https://redditsave.com/info?ur… | [bocchi, the, rock, ending, 3, but, it, the, g… | [] |
106au46 | Soft-Shoe4100 | 1673155578 | Iori and sensei (by @konnyaksankaku ) translat… | False | Comic/TL | [He elite… ups degenerated all over the place!… | [iori, and, sensei, (, by, @konnyaksankaku, ),… | [iori, sensei]
106ak90 | Codex28 | 1673154734 | The black market trade for the “good stuff” in… | False | NON OC ART | [I am a drug dealer after all. After my time a… | [the, black, market, trade, for, the, “, good,… | [hiyori, black, koharu] |
1069i9j | MonsterGirlResearch | 1673151586 | Santa Shiroko (Pixiv: Deriolet) | False | NON OC ART | [Hoxton with another banger Christmas song, Wh… | [santa, shiroko, (, pixiv, :, deriolet, )] | [shiroko] |
… | … | … | … | … | … | … | … | … |
18ph4mm | Cnririaldiyby68392 | 1703371687 | Miyako (by @cram960) | False | NON OC ART | [![img](emote|t5_2vhvtt|35651)![img](emote|t5_… | [miyako, (, by, @cram960, )] | [miyako] |
18ph464 | Accomplished-Joke554 | 1703371646 | The Saiba twins be gaming after bed time (Mido… | False | NON OC ART | [Five Nights at Seminar would be more apt, if … | [the, saiba, twin, be, gaming, after, bed, tim… | [midori, momoi] |
18pgvwv | NegressorSapiens | 1703370944 | Hoshino, Hina, Arisu/Alice & Miyako Sharin… | False | NON OC ART | [what a weird combinations? ah, this must be o… | [hoshino, ,, hina, ,, arisu, /, alice, &, miya… | [aris, hina, miyako, hoshino] |
18pgn3e | mtparanal | 1703370193 | Kanna (1st year) [by Aldi Fauzan] | False | NON OC ART | [Man, Kanna is so cool., Sauce: [pixiv](https:… | [kanna, (, 1st, year, ), [, by, aldi, fauzan, ]] | [kanna] |
18pfar3 | NegressorSapiens | 1703366253 | Hoshino, Hina, Arisu/Alice & Miyako Sharin… | False | NON OC ART | [*Don’t worry about Ojisan here; she’s just do… | [hoshino, ,, hina, ,, arisu, /, alice, &, miya… | [aris, hina, miyako, hoshino] |
21715 rows × 8 columns
Out[123]:
sensei     3295
yuuka      1965
mika       1070
shiroko    1040
aris        960
hina        928
noa         877
rio         832
toki        797
asuna       795
hoshino     694
momoi       571
arona       549
koharu      501
saori       487
midori      468
aru         446
ako         427
kanna       409
kayoko      409
hanako      404
kisaki      392
karin       390
himari      378
hasumi      377
Name: count, dtype: int64
In [124]:
sfw_artworks = data[ ~ data['over_18'] & data['link_flair_text'].isin(['NON OC ART ', 'OC ART ', 'Comic/TL']) ]
display(sfw_artworks)
sfw_artworks_student = pd.Series( [student for students in sfw_artworks.students for student in students] ).value_counts()
sfw_artworks_student
 | author | created_utc | title | over_18 | link_flair_text | comments | tks | students
---|---|---|---|---|---|---|---|---
106c0z1 | AnksDilxMC | 1673159357 | Bocchi the Rock ending 3 but it’s the Game Dev… | False | NON OC ART | [###[View link](https://redditsave.com/info?ur… | [bocchi, the, rock, ending, 3, but, it, the, g… | [] |
106au46 | Soft-Shoe4100 | 1673155578 | Iori and sensei (by @konnyaksankaku ) translat… | False | Comic/TL | [He eliteโฆ ups degenerated all over the place!… | [iori, and, sensei, (, by, @konnyaksankaku, ),… | [iori, sensei] |
106ak90 | Codex28 | 1673154734 | The black market trade for the “good stuff” in… | False | NON OC ART | [I am a drug dealer after all. After my time a… | [the, black, market, trade, for, the, “, good,… | [hiyori, black, koharu] |
1069i9j | MonsterGirlResearch | 1673151586 | Santa Shiroko (Pixiv: Deriolet) | False | NON OC ART | [Hoxton with another banger Christmas song, Wh… | [santa, shiroko, (, pixiv, :, deriolet, )] | [shiroko] |
1067mjy | KURA1439 | 1673146181 | New Year’s Haruna! | False | OC ART | [Count me in, You and me both, [Twitter](https… | [new, year, haruna, !] | [haruna] |
… | … | … | … | … | … | … | … | … |
18ph4mm | Cnririaldiyby68392 | 1703371687 | Miyako (by @cram960) | False | NON OC ART | [![img](emote|t5_2vhvtt|35651)![img](emote|t5_… | [miyako, (, by, @cram960, )] | [miyako] |
18ph464 | Accomplished-Joke554 | 1703371646 | The Saiba twins be gaming after bed time (Mido… | False | NON OC ART | [Five Nights at Seminar would be more apt, if … | [the, saiba, twin, be, gaming, after, bed, tim… | [midori, momoi] |
18pgvwv | NegressorSapiens | 1703370944 | Hoshino, Hina, Arisu/Alice & Miyako Sharin… | False | NON OC ART | [what a weird combinations? ah, this must be o… | [hoshino, ,, hina, ,, arisu, /, alice, &, miya… | [aris, hina, miyako, hoshino] |
18pgn3e | mtparanal | 1703370193 | Kanna (1st year) [by Aldi Fauzan] | False | NON OC ART | [Man, Kanna is so cool., Sauce: [pixiv](https:… | [kanna, (, 1st, year, ), [, by, aldi, fauzan, ]] | [kanna] |
18pfar3 | NegressorSapiens | 1703366253 | Hoshino, Hina, Arisu/Alice & Miyako Sharin… | False | NON OC ART | [*Don’t worry about Ojisan here; she’s just do… | [hoshino, ,, hina, ,, arisu, /, alice, &, miya… | [aris, hina, miyako, hoshino] |
16116 rows × 8 columns
Out[124]:
sensei     2341
yuuka      1419
mika        894
aris        857
hina        769
           ...
saten         3
umika         1
francis       1
haine         1
ruiko         1
Name: count, Length: 160, dtype: int64
In [125]:
nsfw_artworks = data[ data['over_18'] & data['link_flair_text'].isin(['NON OC ART ', 'OC ART ', 'Comic/TL']) ]
display(nsfw_artworks)
nsfw_artworks_student = pd.Series( [student for students in nsfw_artworks.students for student in students] ).value_counts()
nsfw_artworks_student
 | author | created_utc | title | over_18 | link_flair_text | comments | tks | students
---|---|---|---|---|---|---|---|---
106dgnk | ampzzzz | 1673164358 | Bunny Karin & Asuna – by LEMONECO | True | NON OC ART | [\nYour submission was automatically removed f… | [bunny, karin, &, asuna, -, by, lemoneco] | [asuna, karin] |
1069i01 | Otoshi_Gami | 1673151563 | Bunny Ui (By @_xaiqi) | True | NON OC ART | [Ui just wants her comfy clothes, but she know… | [bunny, ui, (, by, @_xaiqi, )] | [ui] |
1065vem | SilverTitanium | 1673141256 | Ready to go into the water with Shiroko, Serik… | True | NON OC ART | [Finally, she is no longer atsui yo, The momen… | [ready, to, go, into, the, water, with, shirok… | [ayane, shiroko, hoshino, serika, nonomi] |
105pohb | FurryGoBrrr | 1673100022 | Bunny Utaha by VIRTUALCITY-Ex. | True | NON OC ART | [Holy crapโฆ this is incredible!, Hoooooly., We… | [bunny, utaha, by, virtualcity-ex, .] | [utaha] |
105oxzi | Paiserr | 1673097812 | You’re saving for me, right sensei? (Hoshino b… | True | NON OC ART | [77k . . . . just wait for me., Hahaha…Nope…. | [you’re, saving, for, me, ,, right, sensei, ?,… | [hoshino, sensei] |
… | … | … | … | … | … | … | … | … |
18pqlc1 | PapaAeon | 1703404267 | Hasumi Sports Bunny (@doksyuri) | True | NON OC ART | [] | [hasumi, sports, bunny, (, @doksyuri, )] | [hasumi] |
18po780 | LinkenNightmare | 1703394874 | Swimsuit Shuro, w/ randoseru (@parmdjs) | True | NON OC ART | [So many bandages…\n\nShuro must be properly… | [swimsuit, shuro, ,, w, /, randoseru, (, @parm… | [] |
18pnt1a | wabbitt37 | 1703393480 | Yuuka’s school swimsuit (by awa氏) | True | NON OC ART | [๐ฅ๐ฅ๐ฅ๐ฅโ๏ธโ๏ธ๐ฅ๐ฅ๐ฏ, Thank you Ambatukam, Most sane s… | [yuuka, school, swimsuit, (, by, awa氏, )] | [yuuka]
18pmuu3 | PichiPik | 1703390147 | Spicy Shiroko Terror colored! (By @PichiPeek) | True | OC ART | [I know that i’m often saying that anything wo… | [spicy, shiroko, terror, colored, !, (, by, @p… | [shiroko] |
18pl0t5 | Jack13515 | 1703383768 | “Merry Christmast, Sensei!” (Asuna and Toki by… | True | NON OC ART | [A ‘stache to rival Great Leader Cherino, [del… | [“, merry, christmast, ,, sensei, !, “, (, asu… | [toki, asuna, sensei] |
5599 rows × 8 columns
Out[125]:
sensei     954
yuuka      546
asuna      400
rio        331
shiroko    275
          ...
haine        1
suzumi       1
michiru      1
pina         1
yukari       1
Name: count, Length: 141, dtype: int64
- for use in Flourish
In [126]:
fl = sfw_artworks_student.to_frame(name='sfw_artworks')
fl['nsfw_artworks'] = nsfw_artworks_student
fl['nsfw_%'] = round(fl.nsfw_artworks / (fl.nsfw_artworks + fl.sfw_artworks) * 100, 2)
fl
Out[126]:
 | sfw_artworks | nsfw_artworks | nsfw_%
---|---|---|---
sensei | 2341 | 954.0 | 28.95 |
yuuka | 1419 | 546.0 | 27.79 |
mika | 894 | 176.0 | 16.45 |
aris | 857 | 103.0 | 10.73 |
hina | 769 | 159.0 | 17.13 |
… | … | … | … |
saten | 3 | NaN | NaN |
umika | 1 | NaN | NaN |
francis | 1 | NaN | NaN |
haine | 1 | 1.0 | 50.00 |
ruiko | 1 | NaN | NaN |
160 rows × 3 columns
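NaN here means a student with SFW art but no NSFW hits; filling the missing counts with 0 gives them a proper 0% (a sketch, not what was exported). Note also that students with only NSFW art never enter fl at all, since the frame is built from the SFW index.

fl0 = fl.fillna({'nsfw_artworks': 0})
fl0['nsfw_%'] = round(fl0.nsfw_artworks / (fl0.nsfw_artworks + fl0.sfw_artworks) * 100, 2)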
In [127]:
fl.nlargest(10, 'nsfw_%')
Out[127]:
 | sfw_artworks | nsfw_artworks | nsfw_%
---|---|---|---
satsuki | 27 | 56.0 | 67.47 |
shizuko | 13 | 19.0 | 59.38 |
master | 28 | 37.0 | 56.92 |
karin | 169 | 221.0 | 56.67 |
mimori | 24 | 30.0 | 55.56 |
tsubaki | 24 | 26.0 | 52.00 |
hanako | 198 | 206.0 | 50.99 |
asuna | 395 | 400.0 | 50.31 |
haine | 1 | 1.0 | 50.00 |
eimi | 99 | 87.0 | 46.77 |
In [128]:
fl.to_csv('./data/etc/flourish-art.csv')
find Twitter artists
In [129]:
artists = pd.Series([ tk for tks in artworks.tks for tk in tks if tk.startswith('@') ]).value_counts()
print(len(artists))
artists.nlargest(10)
3740
Out[129]:
@tonomiya68      80
@donmin_h        61
@asayuki101      60
@wujurana        58
@gogogorx        53
@yukkieeeeeen    51
@ginmei182_e     47
@dltkdrlf92      47
@sin_seishun     45
@blowfish100     40
Name: count, dtype: int64
emoji detector
In [130]:
from nltk.tokenize import TweetTokenizer
ttk = TweetTokenizer()

data_lc = data_l  # alias, not a copy: the monthly frames are mutated in place
try:
    # can't run this part twice: the comment bodies have already been extracted
    for m_data in data_lc:
        m_data.comments = m_data.comments.map(lambda inp: [comment['body'] for comment in inp])
except: pass

emoji_data = pd.DataFrame()
# couldn't find an easy built-in like is_emoji(), so membership in a hardcoded emoji string is used
emojis = '๐๐๐๐คฃ๐๐๐๐๐๐๐๐๐๐๐ฅฐ๐๐๐ฅฒ๐โบ๏ธ๐๐ค๐คฉ๐ค๐ซก๐คจ๐๐๐ถ๐ซฅ๐ถโ๐ซ๏ธ๐๐๐ฃ๐ฅ๐ฎ๐ค๐ฏ๐ช๐ซ๐ฅฑ๐ด๐๐๐๐๐คค๐๐๐๐๐ซค๐๐ซ ๐ค๐ฒโน๏ธ๐๐๐๐๐ค๐ข๐ญ๐ฆ๐ง๐จ๐ฉ๐คฏ๐ฌ๐ฎโ๐จ๐ฐ๐ฑ๐ฅต๐ฅถ๐ณ๐คช๐ต๐ตโ๐ซ๐ฅด๐ ๐ก๐คฌ๐ท๐ค๐ค๐คข๐คฎ๐คง๐๐ฅณ๐ฅธ๐ฅบ๐ฅน๐ค ๐คก๐คฅ๐ซจ๐คซ๐คญ๐ซข๐ซฃ๐ง๐ค๐๐ฟ๐น๐บ๐โ ๏ธ๐ป๐ฝ๐พ๐ค๐ฉ๐ช๐ฆต๐ฆถ๐๐ฆป๐๐ค๐๐โ๏ธ๐ซต๐๐โ๏ธ๐ค๐ซฐ๐๐ซฑ๐ซฒ๐ซณ๐ซด๐ซท๐ซธ๐ค๐ค๐๏ธโ๐๐ค๐๐โ๐๐ค๐ค๐ค๐๐คโ๏ธ๐๐๐๐ซถ๐คฒ๐๐ฆ๐โ๏ธ๐ฅ๐ขโค๏ธ๐ฉท๐งก๐๐๐๐ฉต๐๐ค๐ค๐ฉถ๐ค๐โค๏ธโ๐ฅโค๏ธโ๐ฉนโฃ๏ธ๐๐๐๐๐๐๐๐๐๐ข๐ฅ๐ค๐ฆ๐จ๐ซ๐ณ๏ธโฎ๏ธโโญ๐โโโโโผ๏ธโ๏ธ๐ฏ'

tks_per_month = [[[ttk.tokenize(comment) for comment in comments] for comments in m_data.comments] for m_data in tqdm(data_lc)]
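If the hardcoded string proves brittle, one rough alternative is to test code points against the major emoji blocks (a sketch with approximate ranges, not what this notebook uses):

def looks_like_emoji(tok: str) -> bool:
    # approximate: pictographs/emoticons/supplemental blocks plus the older dingbat range
    return any(
        0x1F300 <= ord(ch) <= 0x1FAFF or 0x2600 <= ord(ch) <= 0x27BF
        for ch in tok
    )

print(looks_like_emoji('🔥'), looks_like_emoji('lol'))  # True False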
In [137]:
emote_count = [pd.Series([tk for p_tk in m_tk for tks in p_tk for tk in tks if tk in emojis]).value_counts() for m_tk in tks_per_month]
emote_count = [count.drop('\ufe0f') for count in emote_count]  # drop the bare U+FE0F variation-selector token
all_emotes = [[tk for p_tk in m_tk for tks in p_tk for tk in tks if tk in emojis] for m_tk in tks_per_month]
count = 0
for m in all_emotes:
    count += len(m)
count
Out[137]:
95717
In [132]:
for i, m in enumerate(emote_count):
    print(f'\n{i+1}: ')
    print(m.nlargest(10))
1:  ๐ญ 796   ๐ข 369   ๐ 113   ๐ 61    โค 53    ๐ 41    ๐ฆ 39    ๐คฃ 33    ๐ฅฐ 33    ๐ 29
2:  ๐ญ 993   ๐ข 492   ๐ 97    ๐คฃ 72    ๐ 63    ๐ฅ 58    ๐ 47    โ 46     ๐ 46    ๐คค 46
3:  ๐ญ 1816  ๐ข 703   ๐ 101   ๐ 95    ๐ฅ 85    ๐ 60    ๐ 59    ๐ค 57    โค 56    ๐คค 51
4:  ๐ญ 2926  ๐ข 954   ๐ 195   ๐ฅต 149   ๐ณ 104   ๐ 98    ๐ 81    โค 78     ๐ฅ 66    ๐ 64
5:  ๐ญ 2708  ๐ข 794   ๐ 138   ๐คค 137   ๐ 109   ๐ฅต 104   ๐ฅ 96    ๐ 88    ๐ 84    โค 62
6:  ๐ญ 2885  ๐ข 1334  ๐ฅต 187   ๐ฅ 152   ๐ 140   ๐ฆ 131   ๐ 120   ๐คค 112   ๐ 104   ๐ 90
7:  ๐ญ 3138  ๐ข 1277  ๐ฅ 202   ๐ 184   ๐ 140   ๐ 140   ๐ 118   ๐ฅต 117   ๐ 114   ๐ฆ 91
8:  ๐ญ 3907  ๐ข 1586  ๐ฅ 507   ๐ 211   ๐ 185   ๐ฅต 177   ๐ 169   โ 168    ๐ฏ 113   ๐ 113
9:  ๐ญ 4043  ๐ข 1298  ๐ฅ 492   ๐ 187   ๐ 161   ๐ฅต 159   โ 152    ๐ค 140   ๐ 129   ๐ 116
10: ๐ญ 4748  ๐ข 1413  ๐ฅ 465   ๐ 255   ๐ฅฐ 238   ๐ 213   ๐ 173   ๐บ 163   ๐ 160   ๐ฅต 156
11: ๐ญ 4036  ๐ข 1084  ๐ฅ 434   ๐ฅฐ 383   ๐ 238   ๐คค 223   ๐ 192   ๐ 179   ๐ฅต 158   ๐ 141
12: ๐ญ 7822  ๐ข 2330  ๐ฅ 848   ๐ฅฐ 671   ๐ฅต 430   ๐บ 413   ๐ฆ 358   โ 327    ๐ 315   ๐ 297
In [133]:
ttl_emoji = pd.Series(dtype='int64')
for m_count in emote_count:
    ttl_emoji = ttl_emoji.add(m_count, fill_value=0).astype('int64')
print(ttl_emoji.nlargest(10))
top_20 = ttl_emoji.nlargest(20).index
๐ญ    39818
๐ข    13634
๐ฅ     3425
๐ฅฐ     1778
๐      1777
๐      1740
๐ฅต     1708
๐      1520
๐คค     1235
๐      1233
dtype: int64
In [134]:
from datetime import datetime
emoji_data = pd.DataFrame(columns=top_20)
for i, m in enumerate(emote_count):
    r = list()
    for emoji in top_20:
        try: r.append(m.loc[[emoji]])
        except: r.append(None)
    emoji_data.loc[datetime(2023,i+1,1)] = pd.concat(r).T
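The inner try/except loop can be replaced by reindex, which aligns each monthly count onto the fixed top-20 order and pads missing emojis with NaN (an equivalent sketch; emoji_data_alt is a hypothetical name):

emoji_data_alt = pd.DataFrame(
    [m.reindex(top_20) for m in emote_count],
    index=[datetime(2023, i + 1, 1) for i in range(len(emote_count))],
)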
In [136]:
emoji_data
Out[136]:
 | ๐ญ | ๐ข | ๐ฅ | ๐ฅฐ | ๐ | ๐ | ๐ฅต | ๐ | ๐คค | ๐ | โ | ๐บ | ๐ค | ๐ | ๐ฆ | โค | ๐ฆ | ๐ณ | ๐ | ๐คฃ
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
2023-01-01 | 796.0 | 369.0 | 20.0 | 33.0 | 61.0 | 113.0 | 17.0 | 14.0 | 22.0 | 29.0 | NaN | 3.0 | 21.0 | 27.0 | 5.0 | 53.0 | 39.0 | 22.0 | 15.0 | 33.0 |
2023-02-01 | 993.0 | 492.0 | 58.0 | 33.0 | 63.0 | 97.0 | 21.0 | 31.0 | 46.0 | 43.0 | 18.0 | 2.0 | 29.0 | 47.0 | 10.0 | 37.0 | 44.0 | 45.0 | 16.0 | 72.0 |
2023-03-01 | 1816.0 | 703.0 | 85.0 | 49.0 | 95.0 | 101.0 | 33.0 | 28.0 | 51.0 | 59.0 | 45.0 | 23.0 | 57.0 | 60.0 | 13.0 | 56.0 | 47.0 | 44.0 | 21.0 | 48.0 |
2023-04-01 | 2926.0 | 954.0 | 66.0 | 37.0 | 98.0 | 195.0 | 149.0 | 42.0 | 60.0 | 64.0 | 18.0 | 32.0 | 51.0 | 81.0 | 23.0 | 78.0 | 29.0 | 104.0 | 29.0 | 41.0 |
2023-05-01 | 2708.0 | 794.0 | 96.0 | 55.0 | 109.0 | 138.0 | 104.0 | 41.0 | 137.0 | 84.0 | 31.0 | 6.0 | 33.0 | 88.0 | 43.0 | 62.0 | 16.0 | 39.0 | 32.0 | 53.0 |
2023-06-01 | 2885.0 | 1334.0 | 152.0 | 36.0 | 140.0 | 90.0 | 187.0 | 120.0 | 112.0 | 104.0 | 50.0 | 55.0 | 64.0 | 76.0 | 131.0 | 56.0 | 78.0 | 58.0 | 29.0 | 45.0 |
2023-07-01 | 3138.0 | 1277.0 | 202.0 | 57.0 | 184.0 | 140.0 | 117.0 | 140.0 | 67.0 | 114.0 | 50.0 | 31.0 | 43.0 | 118.0 | 15.0 | 60.0 | 91.0 | 64.0 | 30.0 | 76.0 |
2023-08-01 | 3907.0 | 1586.0 | 507.0 | 79.0 | 211.0 | 169.0 | 177.0 | 185.0 | 101.0 | 113.0 | 168.0 | 72.0 | 53.0 | 74.0 | 39.0 | 41.0 | 75.0 | 78.0 | 42.0 | 61.0 |
2023-09-01 | 4043.0 | 1298.0 | 492.0 | 107.0 | 187.0 | 161.0 | 159.0 | 129.0 | 78.0 | 97.0 | 152.0 | 105.0 | 140.0 | 116.0 | 50.0 | 83.0 | 89.0 | 69.0 | 39.0 | 46.0 |
2023-10-01 | 4748.0 | 1413.0 | 465.0 | 238.0 | 173.0 | 213.0 | 156.0 | 255.0 | 106.0 | 160.0 | 125.0 | 163.0 | 110.0 | 101.0 | 67.0 | 68.0 | 44.0 | 56.0 | 106.0 | 67.0 |
2023-11-01 | 4036.0 | 1084.0 | 434.0 | 383.0 | 141.0 | 192.0 | 158.0 | 238.0 | 223.0 | 137.0 | 126.0 | 137.0 | 129.0 | 62.0 | 112.0 | 66.0 | 42.0 | 59.0 | 133.0 | 53.0 |
2023-12-01 | 7822.0 | 2330.0 | 848.0 | 671.0 | 315.0 | 131.0 | 430.0 | 297.0 | 232.0 | 229.0 | 327.0 | 413.0 | 195.0 | 69.0 | 358.0 | 92.0 | 154.0 | 89.0 | 200.0 | 60.0 |
In [103]:
emoji_data.to_csv('./data/etc/emojis.csv')
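Before handing the CSV to Flourish, a quick local look at the monthly trend (a minimal matplotlib sketch):

emoji_data[top_20[:5]].astype(float).plot(figsize=(10, 4), marker='o')
plt.title('top-5 emoji per month')
plt.show()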