from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification

# Synthetic demo data: the multilabel generator yields a matrix of
# non-negative integer counts, analogous to what CountVectorizer
# would produce from raw text.
X, _ = make_multilabel_classification(random_state=0)

# Fit a 5-topic LDA model; fit() returns the estimator itself,
# so printing it shows the fitted model's parameters.
model = LatentDirichletAllocation(n_components=5, random_state=0)
model.fit(X)
print(model)

# Per-topic membership probabilities for the last two samples.
print(model.transform(X[-2:]))
# Output:
# LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
#              evaluate_every=-1, learning_decay=0.7,
#              learning_method='batch', learning_offset=10.0,
#              max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
#              n_components=5, n_jobs=None, n_topics=None, perp_tol=0.1,
#              random_state=0, topic_word_prior=None,
#              total_samples=1000000.0, verbose=0)
# [[0.00360392 0.25499205 0.0036211  0.64236448 0.09541846]
#  [0.15297572 0.00362644 0.44412786 0.39568399 0.003586  ]]