File size: 3,124 Bytes
58c2772
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import pandas as pd
import tempfile
from bertopic import BERTopic
from src.reddit import RedditBot
from flask import Blueprint, render_template, request, send_file, redirect, url_for, send_from_directory

# Directory, relative to the process working directory, where the generated
# CSV reports are written.
DOWNLOADS_PATH = os.path.join(os.getcwd(), 'downloads')

# Flask's signature is Blueprint(name, import_name): the name is the endpoint
# prefix used by url_for('views.<endpoint>'), the import name is the module
# the blueprint is defined in.
views = Blueprint('views', __name__)
# Module-level clients, created once at import time and shared by the view
# functions below.
reddit = RedditBot()
topic_model = BERTopic()


def retrieve_subreddits(data: dict) -> pd.DataFrame:
    """Fetch posts from a subreddit and return them as a DataFrame.

    Args:
        data: Submitted form values. The keys ``'subreddit'``, ``'type'``
            and ``'amount'`` are read; ``'amount'`` must be convertible
            with ``int()``.

    Returns:
        The DataFrame produced by ``reddit.convert_posts_to_df`` with an
        additional ``Text`` column holding ``"<Title>: <Content>"`` for
        every row.

    Raises:
        TypeError, ValueError: If ``'amount'`` is missing or is not an
            integer literal.
    """
    # Retrieve the posts through the Reddit API wrapper.
    posts = reddit.get_subreddits_posts(
        name=data.get('subreddit'),
        type=data.get('type'),
        amount=int(data.get('amount')),
    )
    df = reddit.convert_posts_to_df(posts=posts)
    # Column-wise concatenation instead of a row-wise apply(): on an empty
    # frame apply(axis=1) hands back a DataFrame, which cannot be assigned to
    # a single column. A missing title or body is treated as an empty string
    # so one incomplete post does not abort the whole request.
    df['Text'] = df['Title'].fillna('') + ': ' + df['Content'].fillna('')
    return df

@views.route('/', methods=['POST', 'GET'])
def home():
    """Render the search form and, on POST, run topic modelling on a subreddit.

    GET renders ``index.html``. POST validates the submitted ``amount``,
    ``type`` and ``subreddit`` fields, fetches the posts, fits the shared
    BERTopic model on their text and writes two CSV reports into
    ``DOWNLOADS_PATH``: ``topics.csv`` and ``docs_with_topics_info.csv``.

    Returns:
        The rendered form for GET; a redirect to the matching error page
        when validation fails; otherwise a redirect to the success page
        once both reports have been written.
    """
    if request.method != 'POST':
        return render_template('index.html')

    data = request.form

    # A missing or non-numeric amount is a validation failure, not a crash.
    try:
        amount = int(data.get('amount'))
    except (TypeError, ValueError):
        return redirect(url_for('views.error', type_of_error='amount'))
    if not 0 <= amount <= 1000:
        return redirect(url_for('views.error', type_of_error='amount'))
    if data.get('type') not in ('hot', 'new', 'rising', 'top'):
        return redirect(url_for('views.error', type_of_error='type'))
    if not reddit.subreddit_exists(data.get('subreddit')):
        return redirect(url_for('views.error', type_of_error='subreddit'))

    # Retrieve the subreddit posts.
    subreddits_df = retrieve_subreddits(data=data)

    # Topic modelling using BERTopic. The return value is not needed here:
    # the results are read back through get_topic_info()/get_document_info().
    topic_model.fit_transform(subreddits_df.Text)
    topics_df = topic_model.get_topic_info()
    # One stringified list of top words per topic row.
    topics_df['Top words'] = [
        str([word for word, _ in topic_model.get_topic(topic)])
        for topic in topics_df.Topic
    ]

    # The target directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(DOWNLOADS_PATH, exist_ok=True)

    # Write the topics report.
    topics_df.to_csv(os.path.join(DOWNLOADS_PATH, 'topics.csv'), index=False)

    # Write the per-document report.
    docs_df = topic_model.get_document_info(subreddits_df.Text)
    docs_df.to_csv(
        os.path.join(DOWNLOADS_PATH, 'docs_with_topics_info.csv'),
        index=False,
    )

    # NOTE: send_from_directory() is deliberately not called here. A view can
    # return only one response, so a response object that is built and then
    # discarded never reaches the client. The reports stay in DOWNLOADS_PATH;
    # delivering them to the browser needs a dedicated download route.
    return redirect(url_for('views.success'))

# '/succes' is the original (misspelled) path and is kept so existing links
# keep working; the correctly spelled path sits closest to the function and
# is therefore registered first.
@views.route('/succes', methods=['GET'])
@views.route('/success', methods=['GET'])
def success():
    """Render the confirmation page that ``home`` redirects to after a run."""
    return render_template('success.html')

@views.route('/error/<type_of_error>', methods=['GET'])
def error(type_of_error: str):
    """Render the error page for a failed form validation.

    Args:
        type_of_error: Error key taken from the URL. ``'amount'``,
            ``'type'`` and ``'subreddit'`` are recognised.

    Returns:
        The rendered ``error.html`` with a human-readable message. An
        unrecognised key yields a generic message with HTTP status 404
        rather than ``None``, which Flask rejects as an invalid response.
    """
    messages = {
        'amount': 'The amount is higher than 1000 or lower than 0',
        'type': 'The ordering is not within hot, rising, new, top',
        'subreddit': 'The subreddit does not exist',
    }
    message = messages.get(type_of_error)
    if message is None:
        # The URL segment is user-controlled, so an unknown key must still
        # produce a valid response.
        return render_template('error.html', type_of_error='Unknown error'), 404
    return render_template('error.html', type_of_error=message)