# -*- coding: utf-8 -*-
"""
Created on Wed Oct 12 08:57:46 2022

@author: Márton
"""

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?'
]

n = len(corpus)  # number of documents in the corpus

# Basic counting of terms in docs
count = CountVectorizer()
DT_count_sparse = count.fit_transform(corpus)
DT_count_dense = DT_count_sparse.toarray()
vocabulary = count.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
p = len(vocabulary)  # vocabulary size
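
# Illustrative sketch: pair each vocabulary term with its total count across
# the corpus; column j of the document-term matrix counts vocabulary[j]
for term, total in zip(vocabulary, DT_count_dense.sum(axis=0)):
    print(f"{term}: {total}")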

# Boolean (presence/absence) counting; the vectorizer is not named `bool`,
# which would shadow the built-in type
binary_count = CountVectorizer(binary=True)
DT_bool_sparse = binary_count.fit_transform(corpus)
DT_bool_dense = DT_bool_sparse.toarray()
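
# Sanity check: the binary matrix should equal the count matrix thresholded at
# zero, since each entry is 1 exactly when the term occurs in the document
assert np.array_equal(DT_bool_dense, (DT_count_dense > 0).astype(int))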

# Computing relative term frequencies per document: divide each row of the
# count matrix by that document's total term count
tf = DT_count_dense / DT_count_dense.sum(axis=1, keepdims=True)
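
# Each row of tf is now a distribution over the vocabulary, so it must sum to 1
assert np.allclose(tf.sum(axis=1), 1.0)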

# Computing document frequencies: the number of documents containing each term
df = DT_bool_dense.sum(axis=0)
idf = np.log(n / df)  # unsmoothed inverse document frequency
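
# For comparison: scikit-learn's TfidfVectorizer defaults to smooth_idf=True,
# a smoothed variant that avoids division by zero and adds 1 so that terms
# occurring in every document are not zeroed out entirely
idf_smooth = np.log((1 + n) / (1 + df)) + 1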

# Basic tf-idf scoring: weight each relative frequency by its term's idf
# (NumPy broadcasting replaces the explicit outer-product construction)
tfidf_basic = tf * idf
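
# Illustrative sketch: the highest-weighted term in each document under this
# basic scheme (column order is the vectorizer's alphabetical vocabulary order)
top_terms = [vocabulary[j] for j in tfidf_basic.argmax(axis=1)]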

# tf-idf scoring with scikit-learn (defaults: raw counts as tf, smoothed idf,
# L2-normalized rows)
tfidf = TfidfVectorizer()
DT_tfidf_sparse = tfidf.fit_transform(corpus)
DT_tfidf_dense = DT_tfidf_sparse.toarray()
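
# A sketch reconciling the manual computation with scikit-learn's output: the
# vectorizer uses raw counts (not relative frequencies) as tf, the smoothed idf
# from above, and then L2-normalizes each row; both vectorizers were fit on the
# same corpus, so the column order matches
from sklearn.preprocessing import normalize
tfidf_manual = normalize(DT_count_dense * idf_smooth)
assert np.allclose(tfidf_manual, DT_tfidf_dense)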
diag = np.diag(DT_tfidf_dense @ DT_tfidf_dense.T)  # squared L2 norm of each row
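
# Because rows are L2-normalized, each squared norm in `diag` is 1, and the
# Gram matrix directly gives pairwise cosine similarities between documents
assert np.allclose(diag, 1.0)
cosine_sim = DT_tfidf_dense @ DT_tfidf_dense.T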
 