Importing necessary packages
1
2
3
4
| import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
|
Count Vectorizer turns sentences into word counts
1
| corpus = ['This is first sentence', 'Here is the second sentence', 'Third sentence']  # toy corpus: three short sentences, one document each
|
1
2
| count_vec = CountVectorizer()  # no arguments: library defaults are used
# Fit on the corpus and build its document-term count matrix in one call.
# The result is converted with .todense() in the next cell, so it is presumably a sparse matrix.
features = count_vec.fit_transform(corpus)
|
1
| pd.DataFrame(features.todense(), columns=count_vec.get_feature_names())
|
|   | first | here | is | second | sentence | the | third | this |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| 1 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
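TF-IDF Vectorizer turns sentences into vectors of TF-IDF weights rather than raw counts: each word's count is scaled by its inverse document frequency, so words that appear in many sentences (such as "sentence") get lower weights than rarer ones, and each row is then L2-normalized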
1
2
| tfidf = TfidfVectorizer()  # no arguments: library defaults are used
# Fit on the same corpus and build its TF-IDF document-term matrix in one call.
# The result is converted with .todense() in the next cell, so it is presumably a sparse matrix.
features_tfidf = tfidf.fit_transform(corpus)
|
1
| pd.DataFrame(features_tfidf.todense(), columns=tfidf.get_feature_names())
|
|   | first | here | is | second | sentence | the | third | this |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.584483 | 0.000000 | 0.444514 | 0.000000 | 0.345205 | 0.000000 | 0.000000 | 0.584483 |
| 1 | 0.000000 | 0.504611 | 0.383770 | 0.504611 | 0.298032 | 0.504611 | 0.000000 | 0.000000 |
| 2 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.508542 | 0.000000 | 0.861037 | 0.000000 |