import re
import pandas as pd
text = """Sed ut perspiciatis, unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam eaque ipsa, quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt, explicabo. Nemo enim ipsam voluptatem, quia voluptas sit, aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos, qui ratione voluptatem sequi nesciunt, neque porro quisquam est, qui dolorem ipsum, quia dolor sit, amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt, ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit, qui in ea voluptate velit esse, quam nihil molestiae consequatur, vel illum, qui dolorem eum fugiat, quo voluptas nulla pariatur? At vero eos et accusamus et iusto odio dignissimos ducimus, qui blanditiis praesentium voluptatum deleniti atque corrupti, quos dolores et quas molestias excepturi sint, obcaecati cupiditate non provident, similique sunt in culpa, qui officia deserunt mollitia animi, id est laborum et dolorum fuga. Et harum quidem rerum facilis est et expedita distinctio. Nam libero tempore, cum soluta nobis est eligendi optio, cumque nihil impedit, quo minus id, quod maxime placeat, facere possimus, omnis voluptas assumenda est, omnis dolor repellendus. Temporibus autem quibusdam et aut officiis debitis aut rerum necessitatibus saepe eveniet, ut et voluptates repudiandae sint et molestiae non recusandae. Itaque earum rerum hic tenetur a sapiente delectus, ut aut reiciendis voluptatibus maiores alias consequatur aut perferendis doloribus asperiores repellat."""
re.split(r'\W+', text)  # split the passage on runs of non-word characters
result = re.search('[abcdefg]+', text)  # first run of letters a-g
result.group(0)
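# Quick check of the two calls above: the split yields the words of the passage,
# and the first run of letters from [a-g] is the "ed" inside "Sed".
assert re.split(r'\W+', text)[:3] == ['Sed', 'ut', 'perspiciatis']
assert result.group(0) == 'ed'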
%matplotlib inline
positive = pd.read_csv('positive.csv', sep=";", header=None)
negative = pd.read_csv('negative.csv', sep=";", header=None)
positive.shape, negative.shape
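# Assumption: columns 3 and 4 hold the tweet text and the sentiment flag, which
# matches the header-less layout of the mokoron RuTweetCorp CSVs.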
text_data = pd.concat([positive[[3,4]], negative[[3,4]]], ignore_index=True)
text_data.columns = ['tweet', 'mood']
text_data.tweet = text_data.tweet.str.lower()
text_data.tweet.str.len()\
    .value_counts()\
    .sort_index()
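# Keep tweets of 41-140 characters: 140 was Twitter's length cap at the time,
# and very short tweets are presumably dropped as too noisy.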
text_data = text_data[text_data.tweet.str.len().between(41, 140)]
text_data.tweet.str.len()\
    .value_counts()\
    .sort_index()
text_data.tweet.str.len().hist(bins=30)
text_data.tweet.str.findall(r'\w+').apply(len).value_counts().sort_index()  # word counts per tweet
text_data.tweet.str[3:5]
positive[3].str.findall(r'\w+').apply(len).hist(bins=36)
positive.describe()
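# describe() on the raw frame summarizes only the numeric columns (presumably
# tweet metadata such as ids and retweet/favourite counts in this layout).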
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
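# Parameters for the topic-modeling demo below; this block appears to mirror the
# scikit-learn "topic extraction with NMF and LDA" example on 20newsgroups.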
n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20
def print_top_words(model, feature_names, n_top_words):
    # Print the n_top_words highest-weighted terms for each topic.
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        # argsort()[:-n_top_words - 1:-1] walks the indices of the largest
        # weights in descending order.
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()
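# The dump stops before the 20newsgroups models are fit; a minimal sketch of the
# step the scikit-learn example performs next (variable names here are mine):
nmf = NMF(n_components=n_components, random_state=1).fit(tfidf)
print("Topics in NMF model:")
print_top_words(nmf, tfidf_vectorizer.get_feature_names_out(), n_top_words)
lda_20ng = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                     learning_method='online', random_state=0).fit(tf)
print("Topics in LDA model:")
print_top_words(lda_20ng, tf_vectorizer.get_feature_names_out(), n_top_words)
# (On scikit-learn < 1.0, use get_feature_names() instead of get_feature_names_out().)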
import pymystem3
from functools import lru_cache
mystem = pymystem3.Mystem()
import pymorphy2
morph = pymorphy2.MorphAnalyzer()
morph.parse('Собаками')
morph.parse('стали')
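# morph.parse returns a ranked list of Parse candidates; .normal_form on the
# first candidate is the lemma ('Собаками' -> 'собака'). 'стали' is ambiguous
# (verb 'стать' vs noun 'сталь'), which makes it a handy test word below.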
@lru_cache(maxsize=None)
def lemma_one_word(x):
    # Cache lemmas: tweets repeat many word forms, so memoization pays off.
    try:
        return mystem.lemmatize(x)[0]
    except Exception:  # mystem can fail on odd tokens; log the token and skip it
        print(x)
        return " "
def lemma_one_sentence(x):
    return " ".join([lemma_one_word(y) for y in x.split(' ')])

def make_lemma(text, mystem):
    lemmas = text.str.findall(r'(\w+)').apply(lambda x: ' '.join(x)).apply(lemma_one_sentence)
    lemmas = lemmas.reset_index(drop=True)
    return lemmas
@lru_cache(maxsize=None)
def lemmatize_mystem(w):
    return mystem.lemmatize(w)[0]

@lru_cache(maxsize=None)
def lemmatize_pymorphy(w):
    try:
        return morph.parse(w)[0].normal_form
    except Exception:  # pymorphy2 rarely fails, but guard odd tokens anyway
        return ' '
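# Sanity check on the ambiguous form: each analyzer picks its own context-free
# first variant, so the two results may disagree here.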
lemmatize_mystem('стали'), lemmatize_pymorphy('стали')
sample = text_data.tweet.str.findall(r'\w+').head(50000)  # exploratory subset (not used below)
%%time
text_data['mystem_lemma'] = text_data.tweet.str.findall(r'\w+')\
    .apply(lambda x: [lemmatize_mystem(y) for y in x])
%%time
text_data['pymorphy_lemma'] = text_data.tweet.str.findall(r'\w+')\
    .apply(lambda x: [lemmatize_pymorphy(y) for y in x])
text_data = text_data.sample(10000)  # down-sample to keep LDA/MLP training quick
text_data.mystem_lemma = text_data.mystem_lemma.apply(lambda x: ' '.join(x))
# mystem emits trailing newline tokens; strip them out of the joined strings
text_data.mystem_lemma = text_data.mystem_lemma.str.replace('\n', '')
text_data.pymorphy_lemma = text_data.pymorphy_lemma.apply(lambda x: ' '.join(x))
# Fresh vectorizers for the tweets (the names tf/tfidf above already hold the
# 20newsgroups matrices, so avoid reusing them here).
count_vec = CountVectorizer()
tfidf_vec = TfidfVectorizer()  # built for comparison, not used below
mystem_X = count_vec.fit_transform(text_data.mystem_lemma)
pymorphy_X = count_vec.fit_transform(text_data.pymorphy_lemma)  # note: refits the vocabulary
lda = LatentDirichletAllocation(n_components=10, n_jobs=-1, learning_method='batch', batch_size=256)  # batch_size only matters for learning_method='online'
mystem_X_lda = lda.fit_transform(mystem_X)
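# mystem_X_lda now holds a 10-dimensional topic-proportion vector per tweet;
# these topic mixtures, not the raw counts, are the features for the classifier below.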
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(20, 5), solver='adam')  # small MLP over the 10 LDA topic features
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(mystem_X_lda, text_data.mood, test_size=0.3)
model.fit(X_train, y_train)
pd.DataFrame(model.predict_proba(X_train))[0].hist(bins=30)
pd.DataFrame(model.predict_proba(X_test))[0].hist(bins=30)
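# The histograms only eyeball the class-0 probabilities; a minimal follow-up to
# quantify the classifier on held-out data (standard metrics, my addition):
from sklearn.metrics import accuracy_score, classification_report
predictions = model.predict(X_test)
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))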