Thanoss committed on
Commit
9585d12
1 Parent(s): 1271c9e

Upload desription.py


This is an example file

Files changed (1)
  1. desription.py +113 -55
desription.py CHANGED
@@ -2,100 +2,158 @@ import numpy as np
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
  import pickle
- import json
 
- class WordDescriptionModel:
      def __init__(self):
          self.vectorizer = TfidfVectorizer()
-         self.word_descriptions = {}
-         self.word_vectors = None
 
-     def train(self, word_desc_pairs):
-         for word, desc in word_desc_pairs:
-             self.word_descriptions[word.lower()] = desc
-
-         descriptions = list(self.word_descriptions.values())
-         self.word_vectors = self.vectorizer.fit_transform(descriptions)
 
-     def get_description(self, word, similarity_threshold=0.3):
-         word = word.lower()
 
-         if word in self.word_descriptions:
-             return True, self.word_descriptions[word]
 
-         word_vector = self.vectorizer.transform([word])
 
-         similarities = cosine_similarity(word_vector, self.word_vectors).flatten()
 
-         max_sim_idx = np.argmax(similarities)
 
-         if similarities[max_sim_idx] >= similarity_threshold:
-             similar_word = list(self.word_descriptions.keys())[max_sim_idx]
-             return True, f"Similar to '{similar_word}': {self.word_descriptions[similar_word]}"
-         else:
-             return False, f"No description available for '{word}'. Please provide one for training."
 
-     def add_new_word(self, word, description):
-
-         word = word.lower()
-         self.word_descriptions[word] = description
-         # Retrain vectors with updated dataset
-         descriptions = list(self.word_descriptions.values())
-         self.word_vectors = self.vectorizer.fit_transform(descriptions)
 
      def save_model(self, filename):
-
          model_data = {
-             'word_descriptions': self.word_descriptions,
              'vectorizer': self.vectorizer,
-             'word_vectors': self.word_vectors
          }
          with open(filename, 'wb') as f:
              pickle.dump(model_data, f)
 
      def load_model(self, filename):
-
          try:
              with open(filename, 'rb') as f:
                  model_data = pickle.load(f)
-             self.word_descriptions = model_data['word_descriptions']
             self.vectorizer = model_data['vectorizer']
-             self.word_vectors = model_data['word_vectors']
              return True
          except FileNotFoundError:
              return False
 
  def main():
-     model = WordDescriptionModel()
-     model_file = 'word_description_model.pkl'
 
      if not model.load_model(model_file):
-         print("Training new model with initial data...")
-         initial_data = [
-             ('software', 'Computer programs and associated documentation and data that provide instructions for computers to perform specific tasks.'),
-             ('hardware', 'Physical components that make up a computer system or electronic device.'),
-             ('programming', 'Process of creating sets of instructions that tell a computer how to perform tasks.'),
-             ('database', 'Organized collection of structured information or data stored electronically in a computer system.'),
-             ('algorithm', 'Step-by-step procedure or formula for solving a problem or accomplishing a task.')
-         ]
-         model.train(initial_data)
 
      while True:
-         print("\n=== Word Description System ===")
-         word = input("Enter a word to get its description (or 'quit' to exit): ").strip()
 
-         if word.lower() == 'quit':
              break
 
-         found, description = model.get_description(word)
-         print(f"\nResult: {description}")
 
          if not found:
-             print("\nLet's add this word to our database!")
-             new_description = input("Please provide a description for this word: ").strip()
-             model.add_new_word(word, new_description)
-             print(f"\nThank you! '{word}' has been added to the database.")
 
         model.save_model(model_file)
         print("Model has been updated and saved.")
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
  import pickle
+ from datasets import load_dataset
 
+ class CompanyDescriptionModel:
      def __init__(self):
          self.vectorizer = TfidfVectorizer()
+         self.company_descriptions = {}
+         self.description_vectors = None
 
+     def load_huggingface_data(self):
+         """
+         Load and process the job descriptions dataset from HuggingFace
+         """
+         print("Loading dataset from HuggingFace...")
+         dataset = load_dataset("jacob-hugging-face/job-descriptions")
 
+         # Process the training split
+         train_data = dataset['train']
 
+         # Create company-description pairs
+         for item in train_data:
+             company = item['company_name'].strip().lower()
+             description = item['job_description'].strip()
 
+             # If company already exists, append new description
+             if company in self.company_descriptions:
+                 if isinstance(self.company_descriptions[company], list):
+                     self.company_descriptions[company].append(description)
+                 else:
+                     self.company_descriptions[company] = [self.company_descriptions[company], description]
+             else:
+                 self.company_descriptions[company] = description
+
+         print(f"Loaded descriptions for {len(self.company_descriptions)} companies")
 
+         # Create vectors for all descriptions
+         descriptions = []
+         for desc in self.company_descriptions.values():
+             if isinstance(desc, list):
+                 # If multiple descriptions, join them
+                 descriptions.append(" ".join(desc))
+             else:
+                 descriptions.append(desc)
+
+         self.description_vectors = self.vectorizer.fit_transform(descriptions)
 
+     def get_description(self, company_name, similarity_threshold=0.3):
+         """
+         Get job descriptions for a company
+         """
+         company_name = company_name.lower().strip()
 
+         # Direct match
+         if company_name in self.company_descriptions:
+             desc = self.company_descriptions[company_name]
+             if isinstance(desc, list):
+                 return True, f"Found {len(desc)} job descriptions for {company_name}:\n\n" + "\n\n---\n\n".join(desc)
+             return True, f"Job description for {company_name}:\n\n{desc}"
+
+         # Try to find similar company names
+         try:
+             company_vector = self.vectorizer.transform([company_name])
+             similarities = cosine_similarity(company_vector, self.description_vectors).flatten()
+             max_sim_idx = np.argmax(similarities)
+
+             if similarities[max_sim_idx] >= similarity_threshold:
+                 similar_company = list(self.company_descriptions.keys())[max_sim_idx]
+                 desc = self.company_descriptions[similar_company]
+                 if isinstance(desc, list):
+                     return True, f"Similar to '{similar_company}':\n\n" + "\n\n---\n\n".join(desc)
+                 return True, f"Similar to '{similar_company}':\n\n{desc}"
+             else:
+                 return False, f"No job descriptions found for '{company_name}'. Please provide one for training."
+         except Exception as e:
+             return False, f"Error processing company name: {str(e)}"
 
+     def add_new_description(self, company_name, description):
+         """
+         Add a new company and job description
+         """
+         company_name = company_name.lower().strip()
+         if company_name in self.company_descriptions:
+             if isinstance(self.company_descriptions[company_name], list):
+                 self.company_descriptions[company_name].append(description)
+             else:
+                 self.company_descriptions[company_name] = [self.company_descriptions[company_name], description]
+         else:
+             self.company_descriptions[company_name] = description
+
+         # Retrain vectors
+         descriptions = []
+         for desc in self.company_descriptions.values():
+             if isinstance(desc, list):
+                 descriptions.append(" ".join(desc))
+             else:
+                 descriptions.append(desc)
+
+         self.description_vectors = self.vectorizer.fit_transform(descriptions)
 
      def save_model(self, filename):
+         """
+         Save the model to a file
+         """
          model_data = {
+             'company_descriptions': self.company_descriptions,
              'vectorizer': self.vectorizer,
+             'description_vectors': self.description_vectors
          }
          with open(filename, 'wb') as f:
              pickle.dump(model_data, f)
 
      def load_model(self, filename):
+         """
+         Load the model from a file
+         """
          try:
             with open(filename, 'rb') as f:
                  model_data = pickle.load(f)
+             self.company_descriptions = model_data['company_descriptions']
             self.vectorizer = model_data['vectorizer']
+             self.description_vectors = model_data['description_vectors']
              return True
          except FileNotFoundError:
              return False
 
  def main():
+     model = CompanyDescriptionModel()
+     model_file = 'company_description_model.pkl'
 
+     # Try to load existing model, if not found, load from HuggingFace
      if not model.load_model(model_file):
+         print("No existing model found. Loading data from HuggingFace...")
+         model.load_huggingface_data()
+         model.save_model(model_file)
+         print("Initial model created and saved.")
 
      while True:
+         print("\n=== Company Job Description System ===")
+         company = input("Enter a company name to get job descriptions (or 'quit' to exit): ").strip()
 
+         if company.lower() == 'quit':
              break
 
+         found, description = model.get_description(company)
+         print(f"\nResult:\n{description}")
 
          if not found:
+             print("\nLet's add this company to our database!")
+             new_description = input("Please provide a job description for this company: ").strip()
+             model.add_new_description(company, new_description)
+             print(f"\nThank you! Job description for '{company}' has been added to the database.")
 
+         # Save the updated model
         model.save_model(model_file)
         print("Model has been updated and saved.")
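
For quick reference, here is a minimal, hypothetical usage sketch of the new class added in this commit, driven outside the interactive main() loop. It assumes the file is importable as desription (its actual name in this repo) and that the datasets and scikit-learn packages are installed; the class and method names come directly from the diff above, while the company name and pickle path are illustrative only.

# Minimal usage sketch (hypothetical driver, not part of the commit)
from desription import CompanyDescriptionModel

model = CompanyDescriptionModel()
model_file = 'company_description_model.pkl'

# Reuse a previously pickled model if present; otherwise build one from the
# jacob-hugging-face/job-descriptions dataset and cache it.
if not model.load_model(model_file):
    model.load_huggingface_data()
    model.save_model(model_file)

# Exact match first, then a TF-IDF cosine-similarity fallback; returns (found, text).
found, text = model.get_description('google', similarity_threshold=0.3)
print(text)

# Unknown companies can be added on the fly; the vectors are re-fit before saving.
if not found:
    model.add_new_description('google', 'Placeholder job description used for illustration.')
    model.save_model(model_file)

Because save_model pickles the fitted TfidfVectorizer together with the description vectors, later runs can answer lookups without re-downloading the dataset.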