# hf-similarity-check / search_engine.py
# Mitul Mohammad Abdullah Al Mukit
# updates
# e029c8d
import base64
import os
import rsa
from datetime import date
import secrets
import string
import requests
import json
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import csv
def generate_token_id(length):
    """Return a random token made of ASCII letters and digits.

    Each character is drawn independently with ``secrets.choice``, so the
    token is suitable for security-sensitive identifiers.

    Args:
        length: Number of characters to generate. A value of zero or less
            yields an empty string.

    Returns:
        The generated token as a ``str``.
    """
    # Punctuation is not part of the alphabet; only letters and digits are.
    alphabet = string.ascii_letters + string.digits
    picked = [secrets.choice(alphabet) for _ in range(length)]
    return ''.join(picked)
# Examples of what will be generated (ASCII letters and digits only)
# 5xbA9H2f1qT...
# Xe7uMk4d9Rp...
# h3yTb1c8ZwQ...
# LpWqN7a6zX0...
# v9oVrs6Rd2M...
def get_today_date():
    """Return the current local date as an ISO 8601 string (YYYY-MM-DD)."""
    return date.today().isoformat()
# Example of what will be returned (ISO format, YYYY-MM-DD)
# 2023-06-29
def get_request(get_url, params):
    """Send a GET request with RSA-encrypted parameters and return the reply.

    Every value in *params* is UTF-8 encoded, encrypted with the public key
    read from ``../pubkey.pem`` (relative to this module's directory) and
    base64 encoded before being sent as a query parameter. The encrypted
    values are also written to ``sbt_request.txt`` in the current working
    directory, overwriting any previous content.

    Args:
        get_url: URL of the endpoint to query.
        params: Mapping of parameter name to ``str`` value. The mapping is
            left untouched; an encrypted copy is what gets sent.

    Returns:
        The response body decoded from JSON.

    Raises:
        OSError: If the public key file or the dump file cannot be opened.
        requests.exceptions.RequestException: On network failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # get_url = 'http://ipygg-api-test-env.ap-east-1.elasticbeanstalk.com/SBT'
    pubkey_path = os.path.join(os.path.dirname(__file__), '..', 'pubkey.pem')
    with open(pubkey_path, 'rb') as f:
        pub_key = rsa.PublicKey.load_pkcs1(f.read())

    # Build a fresh mapping so the caller's plaintext dict is not overwritten
    # with ciphertext as a side effect of making the request.
    encrypted_params = {
        key: base64.b64encode(rsa.encrypt(value.encode("utf-8"), pub_key))
        for key, value in params.items()
    }

    # Write the encrypted and encoded values to a file
    with open("sbt_request.txt", "w") as f:
        for key, value in encrypted_params.items():
            f.write(f"{key}: {value}\n\n")

    # Without a timeout, requests waits indefinitely on an unresponsive host.
    r = requests.get(get_url, params=encrypted_params, timeout=30)
    # extracting data in json format
    data = r.json()
    print(f'get request: {data}')
    return data
def post_request(post_url, data):
    """Send a POST request with RSA-encrypted form fields and return the reply.

    Every value in *data* is UTF-8 encoded, encrypted with the public key
    read from ``../pubkey.pem`` (relative to this module's directory) and
    base64 encoded before being posted as form data. The encrypted values
    are also written to ``sbt_request.txt`` in the current working
    directory, overwriting any previous content.

    Args:
        post_url: URL of the endpoint to post to.
        data: Mapping of field name to ``str`` value. The mapping is left
            untouched; an encrypted copy is what gets sent.

    Returns:
        The response body decoded from JSON, or the raw response text when
        the body is not valid JSON.

    Raises:
        OSError: If the public key file or the dump file cannot be opened.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    # post_url = 'http://ipygg-api-test-env.ap-east-1.elasticbeanstalk.com/SBT'
    pubkey_path = os.path.join(os.path.dirname(__file__), '..', 'pubkey.pem')
    with open(pubkey_path, 'rb') as f:
        pub_key = rsa.PublicKey.load_pkcs1(f.read())

    # Build a fresh mapping so the caller's plaintext dict is not overwritten
    # with ciphertext as a side effect of making the request.
    encrypted_data = {
        key: base64.b64encode(rsa.encrypt(value.encode("utf-8"), pub_key))
        for key, value in data.items()
    }

    # Write the encrypted and encoded values to a file
    with open("sbt_request.txt", "w") as f:
        for key, value in encrypted_data.items():
            f.write(f"{key}: {value}\n\n")

    # Without a timeout, requests waits indefinitely on an unresponsive host.
    r = requests.post(post_url, data=encrypted_data, timeout=30)

    # The original printed `r.json` (the bound method object) rather than the
    # decoded body. It never raised on a non-JSON reply, so fall back to the
    # raw text instead of introducing a new failure mode.
    try:
        payload = r.json()
    except ValueError:
        payload = r.text
    print(f'post request: {payload}')
    return payload
def search_on_engine(search_data):
    """Search every value of *search_data* on the configured search page.

    The page URL is read from the ``search-engine-url`` environment variable
    (after loading a ``.env`` file). For each value, the text is typed into
    the element named ``query``, the element with id ``submit`` is clicked,
    and every ``<a>`` ``href`` found on the resulting page is collected.
    One row per search is appended to ``search_result.csv`` (no header row),
    which is emptied at the start of every call.

    Args:
        search_data: Mapping of field name to the text to search for.

    Returns:
        A list with one ``{"tag": ..., "results": [...]}`` dict per search,
        in the same order as the rows written to the CSV file. ``results``
        may contain ``None`` for anchors that have no ``href``.

    Raises:
        ValueError: If ``search-engine-url`` is not set.
        AssertionError: If a result page contains "No results found.".
    """
    # clearing csv file data (opening in "w" mode truncates the file)
    csv_filename = "search_result.csv"
    with open(csv_filename, "w", encoding='UTF8', newline=''):
        pass

    # Resolve the URL before launching a browser so a missing setting fails
    # fast with a clear message instead of inside the driver.
    load_dotenv()
    search_url = os.environ.get("search-engine-url")
    if not search_url:
        raise ValueError(
            'environment variable "search-engine-url" is not set'
        )

    # set up driver
    driver = webdriver.Chrome()
    collected = []
    try:
        driver.get(search_url)
        for search_tag in search_data.values():
            # Searching search tag in query
            elem = driver.find_element(By.NAME, "query")
            elem.clear()
            elem.send_keys(search_tag)
            driver.find_element(By.ID, "submit").click()
            # delay for 8 seconds: 2 seconds for Yahoo! search,
            # 6 seconds for Google search
            time.sleep(8)

            # search url remains the same after searching
            html = driver.page_source
            # Raised explicitly (not via `assert`) so the check still runs
            # under `python -O`; the exception type is kept for callers.
            if "No results found." in html:
                raise AssertionError(
                    f'no results found for search tag {search_tag!r}'
                )

            # extract link from page
            soup = BeautifulSoup(html, 'html.parser')
            result_links = [link.get('href') for link in soup.find_all('a')]

            # store in csv
            result_dict = {
                "tag": search_tag,
                "results": result_links,
            }
            with open(csv_filename, 'a', encoding='UTF8', newline='') as f:
                writer = csv.DictWriter(f, fieldnames=["tag", "results"])
                writer.writerow(result_dict)
            collected.append(result_dict)
            # save the link to db
            # post_request("post_url", result_link)
    finally:
        # Always shut the browser down, even when a search fails; otherwise
        # each call leaves a Chrome/chromedriver process behind.
        driver.quit()
    return collected
def get_data_link(chinese_name, english_name, address):
    """Search for a person's details and return what the search produced.

    Bundles the given details with a phone number into one data set and
    passes it to ``search_on_engine``.

    Args:
        chinese_name: Name in Chinese, searched as ``chi_name``.
        english_name: Name in English, searched as ``eng_name``.
        address: Postal address, searched as ``address``.

    Returns:
        Whatever ``search_on_engine`` returns for the assembled data set.
    """
    # get phone number from db
    # phone_no = get_request(get_url, english_name)
    # TODO: hard-coded value; the commented-out call above suggests it is
    # meant to come from the db lookup — confirm before relying on it.
    phone_no = '12345678'

    # create data set
    search_data = {
        "chi_name": chinese_name,
        "eng_name": english_name,
        "address": address,
        "phone_number": phone_no,
    }

    # pass data to search engine and hand its result back to the caller
    # (previously it was stored in an unused local and dropped)
    return search_on_engine(search_data)
# get_data_link('劉柏政','Lau, Pak Ching','FLT 1939 18/F KM WAN HSE, CHOIHUNG EST 1 HUNG MUI AVENUE, NGAU CHI WAN KLN')
# search_on_engine('Lau, Pak Ching')