Spaces:
Runtime error
Runtime error
:tada: initial commit
Browse files
app.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import plotly.graph_objects as go
|
3 |
+
import numpy as np
|
4 |
+
import scipy.integrate as integrate
|
5 |
+
|
6 |
+
def _false_positive_probability(threshold, b, r):
|
7 |
+
def _probability(s):
|
8 |
+
return 1 - (1 - s ** float(r)) ** float(b)
|
9 |
+
a, err = integrate.quad(_probability, 0.0, threshold)
|
10 |
+
return a
|
11 |
+
|
12 |
+
|
13 |
+
def _false_negative_probability(threshold, b, r):
|
14 |
+
def _probability(s):
|
15 |
+
return 1 - (1 - (1 - s ** float(r)) ** float(b))
|
16 |
+
|
17 |
+
a, err = integrate.quad(_probability, threshold, 1.0)
|
18 |
+
return a
|
19 |
+
|
20 |
+
|
21 |
+
def _optimal_param(threshold, num_perm, false_positive_weight, false_negative_weight):
    """
    Find the `MinHashLSH` (bands, rows) pair with the lowest weighted error.

    Every pair with ``bands * rows <= num_perm`` is scored as
    ``fp * false_positive_weight + fn * false_negative_weight``, where ``fp``
    and ``fn`` come from `_false_positive_probability` and
    `_false_negative_probability` at the given threshold. The first pair that
    reaches the smallest score is returned; ``(0, 0)`` is returned when there
    are no candidate pairs.
    """
    best_pair = (0, 0)
    lowest_cost = float("inf")
    for bands in range(1, num_perm + 1):
        # Cap the rows so the pair never uses more than num_perm permutations.
        for rows in range(1, num_perm // bands + 1):
            cost = (
                _false_positive_probability(threshold, bands, rows) * false_positive_weight
                + _false_negative_probability(threshold, bands, rows) * false_negative_weight
            )
            if cost < lowest_cost:
                lowest_cost, best_pair = cost, (bands, rows)
    return best_pair
|
38 |
+
|
39 |
+
|
40 |
+
# ---------------------------------------------------------------------------
# Streamlit page body: interactive explorer for MinHash LSH banding.
# ---------------------------------------------------------------------------

# First row of inputs: similarity of the example document pair and the
# permutation budget used to search for the optimal (bands, rows) split.
col1, col2 = st.columns(2)
s = col1.slider("Select a Jaccard similarity", 0.0, 1.0, 0.1)
# Minimum is 1: with 0 permutations the search has no candidates and would
# display "0 bands / 0 rows" as the optimum.
p = col2.slider("Select a number of permutations", 1, 1000, 10)
# NOTE(review): the similarity `s` is passed as the threshold here, not the
# threshold slider `t` defined further down -- confirm this is intended.
optimal_b, optimal_r = _optimal_param(s, p, 1, 1)

# Second row of inputs: the banding configuration to visualise.
b = col1.slider("Select a number of bands", 1, 100, 1)
r = col2.slider("Select a number of rows per band", 1, 100, 1)

col1.metric(label="Optimal number of bands", value=optimal_b)
col2.metric(label="Optimal number of rows per band", value=optimal_r)

st.markdown("---")

# Step-by-step derivation of the collision probability. Raw f-strings keep
# the LaTeX `\%` literal without relying on an invalid escape sequence.
st.markdown(f"Two documents that have a Jaccard similarity of $s={s}$ will have:")
st.markdown(rf"1. ${s * 100:.2f}\%$ of their k-shingles will be the same")
st.markdown(rf"2. ${s * 100:.2f}\%$ of their k-shingles' hashes will be the same")
st.markdown(rf"3. ${s * 100:.2f}\%$ of the time, a particular hash will be the same for two documents")
st.markdown(
    rf"4. $s^r={100 * s ** r:.2f}\%$ of the time, they will have the same hashes for a particular band of $r={r}$ rows"
)
st.markdown(
    rf"5. $1 - s^r = {100 * (1 - s ** r):.2f}\%$ of the time, they will have at least one different hash for a particular band"
)
st.markdown(
    rf"6. $(1 - s^r)^b = {100 * (1 - s ** r)**b:.2f}\%$ of the time, they will have at least one different hash for all $b={b}$ bands"
)
st.markdown(
    rf"7. $1 - (1 - s^r)^b={100 * (1 - (1 - s ** r)**b):.2f}\%$ of the time, they will have at least one band with the same hashes"
)

t = st.slider("Select a Jaccard similarity threshold", 0.0, 1.0, 0.1)

# Collision curve 1 - (1 - x^r)^b sampled on [0, 1].
x = np.linspace(0, 1, 1000)
y = 1 - (1 - x**r) ** b

fig = go.Figure(
    data=go.Scatter(
        x=x,
        y=y,
        showlegend=False,
    )
)
# Vertical red line marking the chosen threshold.
fig.add_shape(
    type="line",
    x0=t,
    y0=0,
    x1=t,
    y1=1,
    line=dict(
        color="Red",
        width=4,
    ),
)

# Samples at or below the threshold; the rest lie above it.
below = x <= t

# False-positive region: under the curve, left of the threshold. The extra
# (t, 0) point closes the shape on the x-axis.
fig.add_trace(
    go.Scatter(
        x=np.concatenate((x[below], [t])),
        y=np.concatenate((y[below], [0])),
        fill="tozeroy",
        fillcolor="rgba(255, 0, 0, 0.2)",
        line_color="rgba(255, 0, 0, 0)",
        showlegend=False,
    )
)

# False-negative region: between the curve and y = 1, right of the threshold.
# The polygon starts at (t, 1), follows the curve and ends at (1, 1).
fig.add_trace(
    go.Scatter(
        x=np.concatenate(([t], x[~below], [1])),
        y=np.concatenate(([1], y[~below], [1])),
        fill="toself",
        fillcolor="rgba(0, 255, 0, 0.2)",
        line_color="rgba(0, 255, 0, 0)",
        showlegend=False,
    )
)

st.plotly_chart(fig)

# Areas of the two shaded regions, computed by the same helpers that the
# optimal-parameter search uses.
false_positive = _false_positive_probability(t, b, r)
false_negative = _false_negative_probability(t, b, r)

cols = st.columns(2)
cols[0].metric(label="False positive area", value=f"{false_positive:.2f}")
cols[1].metric(label="False negative area", value=f"{false_negative:.2f}")
|