Notebook Intro
This notebook introduces fundamental NLP preprocessing and representation techniques, including tokenization, stopword removal, TF-IDF, and Word2Vec embeddings. As part of the series, it provides the essential building blocks for understanding how text data is processed and represented in machine learning models.
In [ ]:
from nltk.tokenize import word_tokenize
# Note: word_tokenize relies on the 'punkt' tokenizer models,
# which must be fetched once via nltk.download('punkt')
word_tokenize("Photosynthesis occurs in plants.")
Out[ ]:
['Photosynthesis', 'occurs', 'in', 'plants', '.']
In [ ]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Tokenize the sentence
tokens = word_tokenize("Photosynthesis occurs in plants.")
# Download stop words if not already downloaded
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
# Filter out stop words
filtered_tokens = [w for w in tokens if w.lower() not in stop_words]
print("Original tokens:", tokens)
print("Filtered tokens:", filtered_tokens)
Original tokens: ['Photosynthesis', 'occurs', 'in', 'plants', '.']
Filtered tokens: ['Photosynthesis', 'occurs', 'plants', '.']
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
docs = [
    "Physics is interesting",
    "Physics and chemistry are sciences"
]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())
print(tfidf_matrix.toarray())
['and' 'are' 'chemistry' 'interesting' 'is' 'physics' 'sciences']
[[0.         0.         0.         0.6316672  0.6316672  0.44943642 0.        ]
 [0.47107781 0.47107781 0.47107781 0.         0.         0.33517574 0.47107781]]
| Representation | Values | Purpose |
|---|---|---|
| One-Hot Encoding | Binary (0/1) | Presence |
| Bag of Words | Integer counts | Frequency |
| TF-IDF | Decimal | Importance |
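To make the table concrete, here is a from-scratch sketch (plain Python, no scikit-learn) that builds all three representations for the two documents above. The TF-IDF rows follow scikit-learn's default formula (smoothed idf, `ln((1+n)/(1+df)) + 1`, followed by L2 row normalisation) and reproduce the values printed earlier.

```python
import math

# The two documents from the TfidfVectorizer cell above
docs = ["Physics is interesting", "Physics and chemistry are sciences"]
tokenized = [d.lower().split() for d in docs]
vocab = sorted({w for doc in tokenized for w in doc})

# One-hot: presence (0/1); Bag of Words: raw counts
one_hot = [[1 if w in doc else 0 for w in vocab] for doc in tokenized]
bow = [[doc.count(w) for w in vocab] for doc in tokenized]

# TF-IDF with scikit-learn's defaults: smoothed idf, then L2-normalise each row
n = len(tokenized)
df = {w: sum(w in doc for doc in tokenized) for w in vocab}
idf = {w: math.log((1 + n) / (1 + df[w])) + 1 for w in vocab}

tfidf = []
for counts in bow:
    row = [c * idf[w] for w, c in zip(vocab, counts)]
    norm = math.sqrt(sum(v * v for v in row))
    tfidf.append([v / norm for v in row])

print(vocab)
print(one_hot)
print([[round(v, 4) for v in row] for row in tfidf])
```

Note how "physics" gets the *lowest* TF-IDF weight in both rows even though its count is the same as the other words: it appears in every document, so the idf term discounts it.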
In [ ]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
Out[ ]:
True
In [ ]:
sentences = [
    "The cat sits on the mat",
    "The dog sits on the rug",
    "Cats and dogs are pets",
    "Pets are cute and lovely",
    "Students learn physics and chemistry"
]
# Tokenize
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]
print(tokenized_sentences)
[['the', 'cat', 'sits', 'on', 'the', 'mat'], ['the', 'dog', 'sits', 'on', 'the', 'rug'], ['cats', 'and', 'dogs', 'are', 'pets'], ['pets', 'are', 'cute', 'and', 'lovely'], ['students', 'learn', 'physics', 'and', 'chemistry']]
| Parameter | Meaning |
|---|---|
| `vector_size` | Embedding dimension for each word |
| `window` | How many surrounding words to consider during training |
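Before training, it helps to see what `window` actually controls. Here is a minimal sketch of how a skip-gram window of 2 turns one of the sentences above into (center, context) training pairs; gensim performs this pair generation internally during `Word2Vec` training.

```python
# (center, context) pair generation for skip-gram with window=2
sentence = ["the", "cat", "sits", "on", "the", "mat"]
window = 2

pairs = []
for i, center in enumerate(sentence):
    # Look up to `window` positions to the left and right of the center word
    for j in range(max(0, i - window), min(len(sentence), i + window + 1)):
        if j != i:
            pairs.append((center, sentence[j]))

print(pairs[:4])
# [('the', 'cat'), ('the', 'sits'), ('cat', 'the'), ('cat', 'sits')]
```

With `window=2`, "sits" and "mat" never form a pair because they are three positions apart; a larger window would link them.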
In [ ]:
model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=50,   # embedding dimension
    window=2,         # context window size
    min_count=1,      # minimum word frequency to include
    sg=1,             # 1 = skip-gram
    epochs=100        # more epochs for small data
)
# Save if needed
model.save("skipgram_demo.model")
In [ ]:
print("Embedding for 'cat':")
print(model.wv['cat'])
Embedding for 'cat': [-0.01923801 0.01787224 0.00830869 0.01846109 0.01324344 0.00586571 0.01966685 -0.00883491 -0.01377315 0.00848266 0.00749095 -0.01138246 0.01945744 -0.00711169 0.01909935 0.0017056 -0.01264617 -0.00376472 -0.01492901 -0.00611466 0.00212158 0.01901883 0.01884231 -0.01324384 0.00697738 0.00456381 -0.0048769 -0.01838992 0.00192122 -0.01634843 0.01265703 -0.0116715 0.01103789 0.01960382 -0.00038975 0.00899734 -0.00353411 0.01473381 0.00790537 -0.01802165 -0.00478091 0.00724488 -0.00028623 -0.00237024 -0.00192607 -0.00326875 0.00116864 0.00823352 -0.00843931 -0.00768588]
In [ ]:
print("Words similar to 'cat':")
print(model.wv.most_similar('cat'))
Words similar to 'cat':
[('and', 0.23322492837905884), ('physics', 0.22817462682724), ('on', 0.1325155645608902), ('pets', 0.10684175044298172), ('mat', 0.09860452264547348), ('the', 0.06264279037714005), ('chemistry', 0.05703628063201904), ('dogs', 0.05243419110774994), ('learn', -0.0052363998256623745), ('cats', -0.03842007741332054)]
In [ ]:
similarity = model.wv.similarity('cat', 'dog')
print(f"Similarity between 'cat' and 'dog': {similarity}")
Similarity between 'cat' and 'dog': -0.08932257443666458
With only five short training sentences, the embeddings remain close to their random initialisation, so scores like this are essentially noise; meaningful similarities require a much larger corpus.
Skip-gram Vector Arithmetic - Algorithm
Purpose
Perform semantic relationship queries using trained word embeddings from a Skip-gram (Word2Vec) model.
Inputs
- Trained `Word2Vec` Skip-gram model.
- Word A (embedding A).
- Word B (embedding B).
- Word C (embedding C).
- `topn` (number of top similar words to retrieve).
Intuition
- Word embeddings capture semantic relationships as directions in vector space.
- "king" - "man" removes the male component, retaining royalty.
- Adding "woman" adds the female component.
- Resulting vector points towards "queen".
Steps
1. Retrieve embeddings: `vec_A = embedding("A")`, `vec_B = embedding("B")`, `vec_C = embedding("C")`.
2. Compute the arithmetic: $$ \text{result\_vector} = \text{vec\_A} - \text{vec\_B} + \text{vec\_C} $$
3. For each word `w` in the vocabulary, compute: $$ \text{cosine\_similarity}(\text{result\_vector}, \text{embedding}(w)) $$
4. Sort the words by cosine similarity in descending order.
5. Return the top `topn` words with highest similarity as the predicted related words.
Code Snippet (Gensim)
result = model.wv.most_similar(
    positive=[word_a, word_c],
    negative=[word_b],
    topn=5
)
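The steps above can also be sketched from scratch, without gensim. The 2-dimensional vectors below are hand-picked toy values purely for illustration (a trained model like the one above supplies real 50-dimensional embeddings); with them, "king" - "man" + "woman" ranks "queen" first.

```python
import math

def cosine(u, v):
    # Step 3: cosine similarity between two vectors
    dot = sum(a * b for a, b in zip(u, v))
    return dot / (math.sqrt(sum(a * a for a in u)) * math.sqrt(sum(b * b for b in v)))

# Hypothetical 2-d embeddings chosen by hand: the second coordinate
# loosely encodes "royalty", the first "maleness"
emb = {
    "king":  [0.9, 0.8],
    "man":   [0.9, 0.1],
    "woman": [0.1, 0.1],
    "queen": [0.1, 0.8],
    "apple": [0.5, -0.6],
}

# Steps 1-2: retrieve embeddings and compute vec_A - vec_B + vec_C
a, b, c = emb["king"], emb["man"], emb["woman"]
result = [ai - bi + ci for ai, bi, ci in zip(a, b, c)]

# Steps 3-5: rank the remaining vocabulary by cosine similarity to the result
ranked = sorted(
    ((w, cosine(result, v)) for w, v in emb.items() if w not in ("king", "man", "woman")),
    key=lambda p: p[1],
    reverse=True,
)
print(ranked)  # 'queen' ranks first
```

Gensim's `most_similar(positive=..., negative=...)` does the same arithmetic, except it averages the unit-normalised input vectors and excludes the query words from the results.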
In [ ]:
result = model.wv.most_similar(positive=['dog', 'cat'], negative=['physics'])
print(result)
[('and', 0.3115268647670746), ('on', 0.12320028245449066), ('the', 0.09282376617193222), ('mat', 0.023975085467100143), ('pets', -0.014839722774922848), ('cats', -0.0207061804831028), ('chemistry', -0.021130019798874855), ('cute', -0.021545285359025), ('dogs', -0.08284758031368256), ('learn', -0.0889245793223381)]