Thanoss
/

Description_Maker

@@ -2,100 +2,158 @@ import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import pickle
-import json
class WordDescriptionModel:
    """Store word descriptions and look them up, with a TF-IDF fallback.

    Exact (case-insensitive) matches are answered from ``word_descriptions``.
    Otherwise the query text is vectorised with the fitted
    ``TfidfVectorizer`` and compared, by cosine similarity, with the vectors
    of the stored descriptions.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        # Lower-cased word -> description. Insertion order matches the row
        # order of ``word_vectors``.
        self.word_descriptions = {}
        # TF-IDF matrix of the descriptions; None until trained or loaded.
        self.word_vectors = None

    def _refit(self, word_descriptions):
        """Fit the vectorizer on *word_descriptions* and adopt it as state.

        The mapping and its vectors are committed together, and only after
        the fit succeeded, so a failed fit cannot leave the key order and the
        vector rows out of step.
        """
        vectors = self.vectorizer.fit_transform(list(word_descriptions.values()))
        self.word_descriptions = word_descriptions
        self.word_vectors = vectors

    def train(self, word_desc_pairs):
        """Add ``(word, description)`` pairs and refit the TF-IDF vectors.

        Words are lower-cased; a repeated word overwrites its earlier
        description. Errors raised by ``fit_transform`` propagate.
        """
        updated = dict(self.word_descriptions)
        for word, desc in word_desc_pairs:
            updated[word.lower()] = desc
        self._refit(updated)

    def get_description(self, word, similarity_threshold=0.3):
        """Return ``(found, text)`` for *word*.

        Args:
            word: Word to look up (matched case-insensitively).
            similarity_threshold: Minimum cosine similarity for the fallback
                match to count as found.

        Returns:
            ``(True, description)`` on an exact match,
            ``(True, "Similar to ...")`` when the best-scoring stored
            description reaches the threshold, otherwise ``(False, message)``.
        """
        word = word.lower()
        if word in self.word_descriptions:
            return True, self.word_descriptions[word]

        not_found = (
            False,
            f"No description available for '{word}'. Please provide one for training.",
        )
        # Nothing has been vectorised yet, so there is nothing to compare
        # against; report "not found" instead of failing inside sklearn.
        if self.word_vectors is None:
            return not_found

        word_vector = self.vectorizer.transform([word])
        similarities = cosine_similarity(word_vector, self.word_vectors).flatten()
        best_idx = int(np.argmax(similarities))
        if similarities[best_idx] < similarity_threshold:
            return not_found

        # Row i of the matrix belongs to the i-th inserted word.
        similar_word = list(self.word_descriptions)[best_idx]
        return True, f"Similar to '{similar_word}': {self.word_descriptions[similar_word]}"

    def add_new_word(self, word, description):
        """Store *description* for *word* (lower-cased) and refit the vectors."""
        self._refit({**self.word_descriptions, word.lower(): description})

    def save_model(self, filename):
        """Pickle the descriptions, vectorizer and vectors to *filename*."""
        model_data = {
            'word_descriptions': self.word_descriptions,
            'vectorizer': self.vectorizer,
            'word_vectors': self.word_vectors,
        }
        with open(filename, 'wb') as f:
            pickle.dump(model_data, f)

    def load_model(self, filename):
        """Load state saved by ``save_model``.

        Returns:
            ``True`` when the file was read, ``False`` when it does not exist.
            Any other error (corrupt pickle, missing key) propagates.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load model
        # files that this program wrote itself.
        try:
            with open(filename, 'rb') as f:
                model_data = pickle.load(f)
        except FileNotFoundError:
            return False

        # Read every key before assigning so that a malformed file cannot
        # leave the instance half-updated.
        word_descriptions = model_data['word_descriptions']
        vectorizer = model_data['vectorizer']
        word_vectors = model_data['word_vectors']
        self.word_descriptions = word_descriptions
        self.vectorizer = vectorizer
        self.word_vectors = word_vectors
        return True
 def main():
-    model = WordDescriptionModel()
-    model_file = 'word_description_model.pkl'
     if not model.load_model(model_file):
-        print("Training new model with initial data...")
-        initial_data = [
-            ('software', 'Computer programs and associated documentation and data that provide instructions for computers to perform specific tasks.'),
-            ('hardware', 'Physical components that make up a computer system or electronic device.'),
-            ('programming', 'Process of creating sets of instructions that tell a computer how to perform tasks.'),
-            ('database', 'Organized collection of structured information or data stored electronically in a computer system.'),
-            ('algorithm', 'Step-by-step procedure or formula for solving a problem or accomplishing a task.')
-        ]
-        model.train(initial_data)
     while True:
-        print("\n=== Word Description System ===")
-        word = input("Enter a word to get its description (or 'quit' to exit): ").strip()
-        if word.lower() == 'quit':
             break
-        found, description = model.get_description(word)
-        print(f"\nResult: {description}")
         if not found:
-            print("\nLet's add this word to our database!")
-            new_description = input("Please provide a description for this word: ").strip()
-            model.add_new_word(word, new_description)
-            print(f"\nThank you! '{word}' has been added to the database.")
             model.save_model(model_file)
             print("Model has been updated and saved.")

 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import pickle
+from datasets import load_dataset
class CompanyDescriptionModel:
    """Store job descriptions per company and look them up.

    ``company_descriptions`` maps a lower-cased, stripped company name to
    either a single description string or a list of description strings.
    Exact name matches are answered from that mapping. Otherwise the query
    text is vectorised with the fitted ``TfidfVectorizer`` and compared, by
    cosine similarity, with the vectors of the stored descriptions.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        # Company name -> str or list[str]. Insertion order matches the row
        # order of ``description_vectors``.
        self.company_descriptions = {}
        # TF-IDF matrix with one row per company; None until data is loaded.
        self.description_vectors = None

    @staticmethod
    def _merge_description(existing, description):
        """Return the value to store after adding *description*.

        *existing* is ``None`` for a company with no entry yet, a string for
        a single stored description, or a list for several. The input is not
        modified.
        """
        if existing is None:
            return description
        if isinstance(existing, list):
            return [*existing, description]
        return [existing, description]

    def _refit(self, company_descriptions):
        """Fit the vectorizer on *company_descriptions* and adopt it as state.

        Multiple descriptions of one company are joined with a space into a
        single document. The mapping and its vectors are committed together,
        and only after the fit succeeded, so a failed fit cannot leave the
        key order and the vector rows out of step.
        """
        corpus = [
            " ".join(desc) if isinstance(desc, list) else desc
            for desc in company_descriptions.values()
        ]
        vectors = self.vectorizer.fit_transform(corpus)
        self.company_descriptions = company_descriptions
        self.description_vectors = vectors

    def load_huggingface_data(self):
        """Load the HuggingFace job-descriptions dataset and fit the vectors.

        Reads the ``train`` split of ``jacob-hugging-face/job-descriptions``
        and groups ``job_description`` values by ``company_name``. Records
        whose company name or job description is missing or blank are
        skipped. Entries already held by the model are kept.
        """
        print("Loading dataset from HuggingFace...")
        dataset = load_dataset("jacob-hugging-face/job-descriptions")
        train_data = dataset['train']

        collected = dict(self.company_descriptions)
        for item in train_data:
            company = (item['company_name'] or "").strip().lower()
            description = (item['job_description'] or "").strip()
            if not company or not description:
                continue
            collected[company] = self._merge_description(
                collected.get(company), description
            )

        print(f"Loaded descriptions for {len(collected)} companies")
        self._refit(collected)

    def get_description(self, company_name, similarity_threshold=0.3):
        """Return ``(found, text)`` with the job descriptions for a company.

        Args:
            company_name: Company to look up (case and surrounding
                whitespace are ignored).
            similarity_threshold: Minimum cosine similarity for the fallback
                match to count as found.

        Returns:
            ``(True, text)`` on an exact name match or when the best-scoring
            stored description reaches the threshold, otherwise
            ``(False, message)``.
        """
        company_name = company_name.lower().strip()

        # Direct match on the normalised name.
        if company_name in self.company_descriptions:
            desc = self.company_descriptions[company_name]
            if isinstance(desc, list):
                return True, f"Found {len(desc)} job descriptions for {company_name}:\n\n" + "\n\n---\n\n".join(desc)
            return True, f"Job description for {company_name}:\n\n{desc}"

        not_found = (
            False,
            f"No job descriptions found for '{company_name}'. Please provide one for training.",
        )
        # Nothing has been vectorised yet: report "not found" rather than an
        # opaque error from the unfitted vectorizer.
        if self.description_vectors is None:
            return not_found

        # Fallback: score the query text against the stored description
        # text. This does not compare company names with each other.
        try:
            company_vector = self.vectorizer.transform([company_name])
            similarities = cosine_similarity(company_vector, self.description_vectors).flatten()
            best_idx = int(np.argmax(similarities))
            if similarities[best_idx] < similarity_threshold:
                return not_found
            # Row i of the matrix belongs to the i-th inserted company.
            similar_company = list(self.company_descriptions)[best_idx]
            desc = self.company_descriptions[similar_company]
            if isinstance(desc, list):
                return True, f"Similar to '{similar_company}':\n\n" + "\n\n---\n\n".join(desc)
            return True, f"Similar to '{similar_company}':\n\n{desc}"
        except Exception as e:
            # Deliberate best-effort boundary: any lookup failure is
            # reported to the caller as "not found" with the reason.
            return False, f"Error processing company name: {str(e)}"

    def add_new_description(self, company_name, description):
        """Add *description* for *company_name* and refit the vectors.

        The name is lower-cased and stripped. A company that already has one
        or more descriptions gets the new one appended. Errors raised by
        ``fit_transform`` propagate and leave the model unchanged.
        """
        company_name = company_name.lower().strip()
        merged = self._merge_description(
            self.company_descriptions.get(company_name), description
        )
        # The dict merge keeps an existing key in place and appends a new
        # one, preserving the key/row alignment.
        self._refit({**self.company_descriptions, company_name: merged})

    def save_model(self, filename):
        """Pickle the descriptions, vectorizer and vectors to *filename*."""
        model_data = {
            'company_descriptions': self.company_descriptions,
            'vectorizer': self.vectorizer,
            'description_vectors': self.description_vectors,
        }
        with open(filename, 'wb') as f:
            pickle.dump(model_data, f)

    def load_model(self, filename):
        """Load state saved by ``save_model``.

        Returns:
            ``True`` when the file was read, ``False`` when it does not exist.
            Any other error (corrupt pickle, missing key) propagates.
        """
        # SECURITY: pickle.load can execute arbitrary code; only load model
        # files that this program wrote itself.
        try:
            with open(filename, 'rb') as f:
                model_data = pickle.load(f)
        except FileNotFoundError:
            return False

        # Read every key before assigning so that a malformed file cannot
        # leave the instance half-updated.
        company_descriptions = model_data['company_descriptions']
        vectorizer = model_data['vectorizer']
        description_vectors = model_data['description_vectors']
        self.company_descriptions = company_descriptions
        self.vectorizer = vectorizer
        self.description_vectors = description_vectors
        return True
 def main():
+    model = CompanyDescriptionModel()
+    model_file = 'company_description_model.pkl'
+    # Try to load existing model, if not found, load from HuggingFace
     if not model.load_model(model_file):
+        print("No existing model found. Loading data from HuggingFace...")
+        model.load_huggingface_data()
+        model.save_model(model_file)
+        print("Initial model created and saved.")
     while True:
+        print("\n=== Company Job Description System ===")
+        company = input("Enter a company name to get job descriptions (or 'quit' to exit): ").strip()
+        if company.lower() == 'quit':
             break
+        found, description = model.get_description(company)
+        print(f"\nResult:\n{description}")
         if not found:
+            print("\nLet's add this company to our database!")
+            new_description = input("Please provide a job description for this company: ").strip()
+            model.add_new_description(company, new_description)
+            print(f"\nThank you! Job description for '{company}' has been added to the database.")
+            # Save the updated model
             model.save_model(model_file)
             print("Model has been updated and saved.")