Spaces:
Sleeping
Sleeping
File size: 5,676 Bytes
afadb57 9045f37 112b798 9045f37 afadb57 e0fb16c afadb57 e0fb16c afadb57 e0fb16c afadb57 e0fb16c afadb57 e0fb16c afadb57 e0fb16c afadb57 9045f37 afadb57 e0fb16c afadb57 e0fb16c afadb57 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 fdf715a 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 e0fb16c 9045f37 afadb57 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from joblib import dump, load
from sklearn.preprocessing import normalize
import re
def get_next_version(file_prefix, folder='RecommendationFiles/'):
    """Return the path to use for the next version of a versioned file.

    Files are expected to be named ``<file_prefix>_<number>.joblib``. The
    folder is created when it does not exist yet.

    Args:
        file_prefix: Literal file name prefix (it is not treated as a regex).
        folder: Directory that holds the versioned files.

    Returns:
        ``folder`` joined with ``<file_prefix>_<NNNN>.joblib``, where ``NNNN``
        is one more than the highest existing version (zero-padded to four
        digits), or ``0001`` when no version exists yet.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)

    # Escape the prefix and match the whole name so that look-alike files
    # (e.g. 'prefix_0003.joblib.bak' or names where '.' acted as a wildcard)
    # are not counted as versions.
    pattern = re.compile(rf"{re.escape(file_prefix)}_(\d+)\.joblib")
    versions = [
        int(match.group(1))
        for match in map(pattern.fullmatch, os.listdir(folder))
        if match
    ]

    next_version = max(versions, default=0) + 1
    return os.path.join(folder, f"{file_prefix}_{next_version:04d}.joblib")
def get_latest_version(file_prefix, folder='RecommendationFiles/'):
    """Return the path of the highest-numbered version of a versioned file.

    Files are expected to be named ``<file_prefix>_<number>.joblib``.

    Args:
        file_prefix: Literal file name prefix (it is not treated as a regex).
        folder: Directory that holds the versioned files.

    Returns:
        The path (``folder`` joined with the real file name) of the file with
        the highest version number.

    Raises:
        FileNotFoundError: If ``folder`` does not exist or contains no file
            matching the naming scheme.
    """
    if not os.path.exists(folder):
        raise FileNotFoundError(f"Folder '{folder}' does not exist")

    # Escape the prefix and match the whole name so that look-alike files
    # (e.g. 'prefix_0003.joblib.bak') are not mistaken for versions.
    pattern = re.compile(rf"{re.escape(file_prefix)}_(\d+)\.joblib")
    candidates = []
    for name in os.listdir(folder):
        match = pattern.fullmatch(name)
        if match:
            candidates.append((int(match.group(1)), name))

    if not candidates:
        raise FileNotFoundError(f"No versions found for {file_prefix} in folder '{folder}'")

    # Return the name that is actually on disk instead of rebuilding it with
    # fixed zero padding, which would point at a non-existent file whenever
    # the stored name uses a different padding (e.g. 'prefix_2.joblib').
    _, latest_name = max(candidates)
    return os.path.join(folder, latest_name)
def recomienda_tf(new_basket, cestas, productos, n_similar=4, top_n=5):
    """Recommend products for a basket from the most similar stored baskets.

    The latest persisted ``count_matrix`` and ``count_vectorizer`` files are
    loaded, the new basket is vectorized and L1-normalized, and its cosine
    similarity against every stored basket is computed. Products found in the
    most similar baskets (and not already in ``new_basket``) are scored with
    the sum of the similarities of the baskets that contain them, divided by
    the total similarity of the baskets considered.

    Args:
        new_basket: List of product identifiers (strings) in the basket.
        cestas: DataFrame with a ``'Cestas'`` column of whitespace-separated
            product identifiers. Rows are looked up by position, so they are
            assumed to be in the same order as the rows of the stored matrix.
        productos: DataFrame with ``'ARTICULO'`` and ``'DESCRIPCION'`` columns.
        n_similar: How many of the most similar baskets to draw from.
        top_n: Maximum number of recommendations to return.

    Returns:
        DataFrame with columns ``ARTICULO``, ``DESCRIPCION`` and
        ``RELEVANCIA``, sorted by descending relevance, with at most
        ``top_n`` rows. It is empty (but keeps those columns) when no stored
        basket has a positive similarity.

    Raises:
        FileNotFoundError: Propagated from ``get_latest_version`` when no
            persisted matrix or vectorizer is found.
    """
    result_columns = ['ARTICULO', 'DESCRIPCION', 'RELEVANCIA']

    # Load the latest persisted matrix and vectorizer.
    tf_matrix = load(get_latest_version('count_matrix'))
    count = load(get_latest_version('count_vectorizer'))

    # Vectorize the new basket and normalize it to term frequencies.
    new_basket_vector = count.transform([' '.join(new_basket)])
    new_basket_tf = normalize(new_basket_vector, norm='l1')

    # One similarity value per stored basket.
    similarities = cosine_similarity(new_basket_tf, tf_matrix)[0]
    # Reverse the ascending argsort and slice from the front, so that
    # n_similar == 0 yields nothing (a '[-0:]' slice would yield everything).
    similar_indices = similarities.argsort()[::-1][:n_similar]

    basket_items = set(new_basket)  # O(1) membership tests inside the loop
    recommendations_count = {}
    total_similarity = 0
    for idx in similar_indices:
        sim_score = similarities[idx]
        # A basket with no similarity shares nothing with the new basket;
        # its products would only add zero-relevance noise to the result.
        if sim_score <= 0:
            continue
        total_similarity += sim_score
        # Count each product once per basket, however often it is repeated.
        for product in set(cestas.iloc[idx]['Cestas'].split()):
            if product not in basket_items:
                recommendations_count[product] = recommendations_count.get(product, 0) + sim_score

    if total_similarity <= 0:
        print("No se encontraron similitudes suficientes para calcular probabilidades.")
        return pd.DataFrame(columns=result_columns)

    # Highest score first; ties are broken by product id so that the output
    # does not depend on set/dict iteration order.
    ranked = sorted(recommendations_count.items(), key=lambda item: (-item[1], item[0]))

    recommendations_data = []
    for product, score in ranked:
        # Products without a description in the catalogue are skipped.
        description = productos.loc[productos['ARTICULO'] == product, 'DESCRIPCION']
        if description.empty:
            continue
        recommendations_data.append({
            'ARTICULO': product,
            'DESCRIPCION': description.values[0],
            'RELEVANCIA': score / total_similarity,
        })
        # The list is already ranked, so stop once enough rows are collected.
        if len(recommendations_data) >= top_n:
            break

    return pd.DataFrame(recommendations_data, columns=result_columns).head(top_n)
def retroalimentacion(cestas, cesta_nueva):
    """Feed a new basket back into the history and rebuild the stored model.

    The basket is appended to ``cestas`` (in place) and the CSV copy is
    rewritten unless an identical basket string is already present. The
    vectorizer and the L1-normalized matrix are then re-fitted on all baskets
    and saved as new versions.

    Args:
        cestas: DataFrame with a ``'Cestas'`` column of whitespace-separated
            product identifiers. It is modified in place when the basket is
            new.
        cesta_nueva: List of product identifiers (strings) of the new basket.

    Returns:
        None.
    """
    cesta_unida = ' '.join(cesta_nueva)

    if (cestas['Cestas'] == cesta_unida).any():
        print("La cesta ya existe en el DataFrame.")
    else:
        cestas.loc[len(cestas)] = cesta_unida
        print("Cesta añadida.")
        # Make sure the target folder exists before writing the CSV.
        os.makedirs('RecommendationFiles/', exist_ok=True)
        cestas.to_csv('RecommendationFiles/cestas_final.csv', index=False)

    # Re-fit the vectorizer on every basket and normalize to term frequencies.
    count_vectorizer = CountVectorizer()
    count_matrix = count_vectorizer.fit_transform(cestas['Cestas'])
    tf_matrix = normalize(count_matrix, norm='l1')

    # The matrix must be stored under the 'count_matrix' prefix because that
    # is the prefix recomienda_tf loads. Saving it under any other prefix
    # would leave the recommender pairing the new vectorizer with a stale
    # matrix.
    dump(count_vectorizer, get_next_version('count_vectorizer'))
    dump(tf_matrix, get_next_version('count_matrix'))
    return None