import pickle

import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


class CompanyDescriptionModel:
    """
    Store job descriptions per company and look them up by company name.

    Lookups first try an exact match on the normalized (stripped,
    lower-cased) company name. If that fails, the query text is projected
    into the TF-IDF space fitted on the stored descriptions and the company
    whose combined description text is most similar to the query is
    returned, provided the cosine similarity reaches a threshold.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        # Maps a normalized company name to either a single description
        # string or, once a second description arrives, a list of strings.
        # This mixed shape is kept so previously pickled models still load.
        self.company_descriptions = {}
        # TF-IDF matrix with one row per company, in dict insertion order;
        # None until descriptions have been vectorized.
        self.description_vectors = None

    def _store_description(self, company, description):
        """Record `description` under `company`, promoting a str entry to a list."""
        if company not in self.company_descriptions:
            self.company_descriptions[company] = description
            return

        existing = self.company_descriptions[company]
        if isinstance(existing, list):
            existing.append(description)
        else:
            self.company_descriptions[company] = [existing, description]

    def _rebuild_vectors(self):
        """Refit the vectorizer on one joined document per company."""
        documents = [
            " ".join(desc) if isinstance(desc, list) else desc
            for desc in self.company_descriptions.values()
        ]
        if not documents:
            # Nothing to fit on; leave the model in its "no vectors" state
            # rather than letting fit_transform fail on an empty corpus.
            self.description_vectors = None
            return
        self.description_vectors = self.vectorizer.fit_transform(documents)

    @staticmethod
    def _render(header, desc):
        """Format a str-or-list description entry beneath `header`."""
        body = "\n\n---\n\n".join(desc) if isinstance(desc, list) else desc
        return f"{header}:\n\n{body}"

    def load_huggingface_data(self):
        """
        Load and process the job descriptions dataset from HuggingFace.

        Reads the 'train' split of "jacob-hugging-face/job-descriptions",
        groups job descriptions by normalized company name and fits the
        TF-IDF vectors on the result.
        """
        print("Loading dataset from HuggingFace...")
        dataset = load_dataset("jacob-hugging-face/job-descriptions")
        train_data = dataset['train']

        for item in train_data:
            # `or ""` guards against None values, which would otherwise
            # raise AttributeError on .strip().
            company = (item['company_name'] or "").strip().lower()
            description = (item['job_description'] or "").strip()
            if not company or not description:
                # A row without a name or without text cannot be looked up
                # and adds nothing to the TF-IDF corpus.
                continue
            self._store_description(company, description)

        print(f"Loaded descriptions for {len(self.company_descriptions)} companies")
        self._rebuild_vectors()

    def get_description(self, company_name, similarity_threshold=0.3):
        """
        Get job descriptions for a company.

        Args:
            company_name: Company name to look up; case and surrounding
                whitespace are ignored.
            similarity_threshold: Minimum cosine similarity between the
                query and a company's description text for the fallback
                match to be accepted.

        Returns:
            A `(found, message)` tuple. `found` is True for an exact or
            similar match, in which case `message` holds the formatted
            description(s); otherwise `message` explains the failure.
        """
        company_name = company_name.lower().strip()

        # Direct match
        if company_name in self.company_descriptions:
            desc = self.company_descriptions[company_name]
            if isinstance(desc, list):
                header = f"Found {len(desc)} job descriptions for {company_name}"
            else:
                header = f"Job description for {company_name}"
            return True, self._render(header, desc)

        not_found = (
            False,
            f"No job descriptions found for '{company_name}'. Please provide one for training.",
        )

        # An empty query has no terms to compare, and without fitted
        # vectors there is nothing to compare against.
        if not company_name or self.description_vectors is None:
            return not_found

        # Fallback: compare the query against the stored description text.
        try:
            query_vector = self.vectorizer.transform([company_name])
            similarities = cosine_similarity(
                query_vector, self.description_vectors
            ).flatten()
            best_idx = int(np.argmax(similarities))
            best_score = similarities[best_idx]
            # Rows of description_vectors follow dict insertion order.
            similar_company = list(self.company_descriptions)[best_idx]
        except Exception as e:
            # Deliberately broad: this method backs an interactive prompt
            # and reports failures to the user instead of raising.
            return False, f"Error processing company name: {str(e)}"

        if best_score < similarity_threshold:
            return not_found

        desc = self.company_descriptions[similar_company]
        return True, self._render(f"Similar to '{similar_company}'", desc)

    def add_new_description(self, company_name, description):
        """
        Add a new company and job description.

        The description is stored under the normalized company name
        (appended if the company already exists) and the TF-IDF vectors
        are refitted on the full corpus.
        """
        company_name = company_name.lower().strip()
        self._store_description(company_name, description)
        # Retrain vectors
        self._rebuild_vectors()

    def save_model(self, filename):
        """
        Save the model to a file.

        Pickles the description mapping, the fitted vectorizer and the
        description vectors into `filename`.
        """
        model_data = {
            'company_descriptions': self.company_descriptions,
            'vectorizer': self.vectorizer,
            'description_vectors': self.description_vectors,
        }
        with open(filename, 'wb') as f:
            pickle.dump(model_data, f)

    def load_model(self, filename):
        """
        Load the model from a file.

        Returns:
            True if the model was loaded; False if the file does not exist
            or does not contain a readable model, in which case the current
            state is left untouched.
        """
        try:
            # SECURITY: pickle.load can execute arbitrary code embedded in
            # the file. Only load model files this program wrote itself.
            with open(filename, 'rb') as f:
                model_data = pickle.load(f)
            company_descriptions = model_data['company_descriptions']
            vectorizer = model_data['vectorizer']
            description_vectors = model_data['description_vectors']
        except FileNotFoundError:
            return False
        except (pickle.UnpicklingError, EOFError, KeyError, TypeError) as e:
            print(f"Could not read model file '{filename}': {e}")
            return False

        # Assign only after every field was read so a bad file can never
        # leave the instance half-loaded.
        self.company_descriptions = company_descriptions
        self.vectorizer = vectorizer
        self.description_vectors = description_vectors
        return True


def _prompt(message):
    """Read one stripped line from stdin; return None when input is closed."""
    try:
        return input(message).strip()
    except EOFError:
        return None


def main():
    """Run the interactive lookup loop, building the model on first use."""
    model = CompanyDescriptionModel()
    model_file = 'company_description_model.pkl'

    # Try to load existing model, if not found, load from HuggingFace
    if not model.load_model(model_file):
        print("No existing model found. Loading data from HuggingFace...")
        model.load_huggingface_data()
        model.save_model(model_file)
        print("Initial model created and saved.")

    while True:
        print("\n=== Company Job Description System ===")
        company = _prompt("Enter a company name to get job descriptions (or 'quit' to exit): ")
        if company is None or company.lower() == 'quit':
            break
        if not company:
            print("Please enter a company name.")
            continue

        found, description = model.get_description(company)
        print(f"\nResult:\n{description}")
        if found:
            continue

        print("\nLet's add this company to our database!")
        new_description = _prompt("Please provide a job description for this company: ")
        if new_description is None:
            break
        if not new_description:
            # An empty description would be stored but could never match.
            print("No description entered; nothing was added.")
            continue

        model.add_new_description(company, new_description)
        print(f"\nThank you! Job description for '{company}' has been added to the database.")

        # Save the updated model
        model.save_model(model_file)
        print("Model has been updated and saved.")


if __name__ == "__main__":
    main()