# Source file: aii-monorepo/exercises/information_retrieval/ej4/main.py
# (file-viewer header: 247 lines, 10 KiB, Python)

import locale
import re
import urllib.request
from datetime import datetime
from pathlib import Path
import tkinter as tk
from tkinter import messagebox, ttk
from tkinter import Tk
from tkinter.scrolledtext import ScrolledText
import shutil, re, os
from bs4 import BeautifulSoup
from whoosh.index import create_in,open_dir
from whoosh.fields import Schema, TEXT, DATETIME, KEYWORD, ID, NUMERIC
from whoosh.qparser import QueryParser
from whoosh import index, qparser, query
# Root URL of the recipe site that is scraped.
BASE_URL = "https://recetas.elperiodico.com"

# Listing page whose results are indexed.
RECIPES_URL = f"{BASE_URL}/Recetas-de-Aperitivos-tapas-listado_receta-1_1.html"

# Directory that stores the Whoosh index: an "index" folder located in the
# parent of the directory that contains this file.
DATA_DIR = Path(__file__).parent.parent.joinpath("index")
def init_ssl():
    """Make the default HTTPS context skip certificate verification.

    Nothing is changed when the ``PYTHONHTTPSVERIFY`` environment variable is
    set to a non-empty value, or when the running ``ssl`` module does not
    provide ``_create_unverified_context``.

    NOTE(review): this turns off certificate checks for every default HTTPS
    context created afterwards in the process; acceptable for a scraping
    exercise, not for production code.
    """
    import os
    import ssl

    # An explicit PYTHONHTTPSVERIFY setting wins: leave the context alone.
    if os.environ.get('PYTHONHTTPSVERIFY', ''):
        return

    unverified_factory = getattr(ssl, '_create_unverified_context', None)
    if unverified_factory:
        ssl._create_default_https_context = unverified_factory
def create_index():
    """Create the Whoosh index ``RecipesIndex`` in ``DATA_DIR`` if it is missing.

    The data directory is created when needed. If an index with that name is
    already stored there it is left untouched. In both cases a short message
    is printed to standard output.
    """
    # Path.mkdir(exist_ok=True) replaces the exists()/mkdir() pair: it cannot
    # race with another process creating the directory and it also creates
    # missing parent directories.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    if index.exists_in(DATA_DIR, indexname="RecipesIndex"):
        print("An index already exists")
        return

    schema = Schema(
        title=TEXT(stored=True),
        difficulty=TEXT(stored=True),
        duration=TEXT(stored=True),
        units=NUMERIC(stored=True, numtype=int),
        author=ID(stored=True),
        updated_at=DATETIME(stored=True),
        # Keywords are separated by commas rather than by whitespace.
        features=KEYWORD(stored=True, commas=True),
        intro=TEXT(stored=True),
    )
    idx = create_in(DATA_DIR, schema=schema, indexname="RecipesIndex")
    print(f"Created index: {idx.indexname}")
def parse_duration(duration):
    """Convert a duration string such as ``"1h 30m"`` into total minutes.

    The text is stripped and lower-cased, then the first ``<digits>h`` and
    the first ``<digits>m`` groups are read; a missing group counts as zero.

    Returns ``None`` for an empty or ``None`` input, otherwise an ``int``.
    """
    if not duration:
        return None

    text = duration.strip().lower()
    total_minutes = 0
    # (pattern, minutes represented by one unit of the captured number)
    for pattern, factor in ((r"(\d+)h", 60), (r"(\d+)m", 1)):
        found = re.search(pattern, text)
        if found:
            total_minutes += int(found.group(1)) * factor
    return total_minutes
def parse_duration_inverse(minutes):
    """Format a number of minutes as ``"<h>h <m>m"``, or ``"<m>m"`` under an hour.

    Returns ``None`` when *minutes* is ``None``.
    """
    if minutes is None:
        return None

    hours, remainder = divmod(minutes, 60)
    if hours == 0:
        return f"{remainder}m"
    return f"{hours}h {remainder}m"
def _fetch_soup(url):
    """Download *url* and return it parsed with the ``lxml`` parser.

    The HTTP response is closed once the document has been parsed, so no
    connection is left open per scraped page.
    """
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response, "lxml")


def _scrape_recipe(div):
    """Build the index document (a dict of schema fields) for one listing entry.

    *div* is one result element of the listing page. Title, difficulty,
    intro, duration and units are read from it; author, update date and
    features are read from the recipe's detail page, which is downloaded
    here. Missing optional pieces fall back to ``"Unknown"`` (``-1`` for
    units).
    """
    from urllib.parse import urljoin

    title_a = div.a
    title = title_a.string.strip()

    info_div = div.find("div", class_="info_snippet")
    difficulty_span = info_div.find("span") if info_div is not None else None
    difficulty = (
        difficulty_span.get_text(strip=True)
        if difficulty_span is not None
        else "Unknown"
    )

    intro = div.find("div", class_="intro").get_text()

    properties = div.find("div", class_="properties")
    duration_span = units_span = None
    if properties is not None:
        duration_span = properties.find("span", class_="duracion")
        units_span = properties.find("span", class_="unidades")
    duration = duration_span.string.strip() if duration_span is not None else "Unknown"
    units = int(units_span.string.strip()) if units_span is not None else -1

    # urljoin leaves an absolute href untouched and resolves a relative one
    # against the site root, so both link styles work.
    details_soup = _fetch_soup(urljoin(BASE_URL, title_a["href"]))
    details = details_soup.find("div", class_="autor").find("div", class_="nombre_autor")
    author = details.find("a").string
    date_str = details.find("span").string.replace("Actualizado: ", "").strip()
    # "%B" is a locale-dependent month name; the caller is expected to have
    # set a matching LC_TIME locale beforehand.
    updated_at = datetime.strptime(date_str, "%d %B %Y")

    # A list passed to class_ matches elements carrying ANY of the classes;
    # the CSS selector requires BOTH, which is what identifies the block.
    features_div = details_soup.select_one("div.properties.inline")
    features = (
        features_div.get_text(strip=True).replace("Características adicionales:", "")
        if features_div is not None
        else "Unknown"
    )

    return {
        "title": title,
        "difficulty": difficulty,
        "duration": duration,
        "units": units,
        "author": author,
        "updated_at": updated_at,
        "features": features,
        "intro": intro,
    }


def persist_recipes():
    """Scrape the recipe listing and store every recipe in ``RecipesIndex``.

    The listing page at ``RECIPES_URL`` and one detail page per recipe are
    downloaded. The scraped documents replace the previous contents of the
    index, so loading again does not duplicate recipes.

    Returns:
        The number of recipes indexed.

    Raises:
        Whatever the network, the HTML parsing or Whoosh raise. Scraping
        finishes before the writer is opened, and a failure while adding
        documents cancels the writer, so the index keeps its previous
        contents and its write lock is released.
    """
    from whoosh import writing

    idx = index.open_dir(DATA_DIR, "RecipesIndex")

    # Scrape everything first: the index writer (and its lock) is only held
    # for the short time needed to add the documents.
    listing = _fetch_soup(RECIPES_URL)
    results = listing.find_all("div", attrs={"data-js-selector": "resultado"})
    recipes = [_scrape_recipe(div) for div in results]

    writer = idx.writer()
    try:
        for recipe in recipes:
            writer.add_document(**recipe)
    except Exception:
        writer.cancel()
        raise
    # CLEAR drops the existing segments so only the new documents remain.
    writer.commit(mergetype=writing.CLEAR)
    return len(recipes)
class RecipesUI():
    """Tkinter window with the "Datos" and "Buscar" menus of the recipe app.

    The class only builds widgets. Every menu entry forwards an action name
    (for example ``"load"``) to ``self.callback``, which the entry point must
    assign to a function taking that name.
    """

    def __init__(self, root, title = "AII"):
        """Configure *root* (title, size) and build the menu bar."""
        self.root = root
        self.root.title(title)
        self.root.geometry("900x600")

        # External callback assigned by the entry point after construction.
        self.callback = None

        # Main menu bar
        self.menu = tk.Menu(self.root)
        self.root.config(menu=self.menu)

        # "Datos" (data) menu
        datos_menu = tk.Menu(self.menu, tearoff=0)
        datos_menu.add_command(label="Cargar", command=lambda: self._dispatch("load"))
        datos_menu.add_command(label="Listar", command=lambda: self._dispatch("list_recipes"))
        datos_menu.add_separator()
        datos_menu.add_command(label="Salir", command=self.root.quit)
        self.menu.add_cascade(label="Datos", menu=datos_menu)

        # "Buscar" (search) menu
        buscar_menu = tk.Menu(self.menu, tearoff=0)
        buscar_menu.add_command(label="Título o Introducción", command=lambda: self._dispatch("search_title_or_intro"))
        buscar_menu.add_command(label="Fecha", command=lambda: self._dispatch("search_updated_at"))
        buscar_menu.add_command(label="Características y Título", command=lambda: self._dispatch("search_features_and_title"))
        self.menu.add_cascade(label="Buscar", menu=buscar_menu)

    def _dispatch(self, action):
        """Forward *action* to ``self.callback``; warn if none is registered.

        The callback is looked up at click time, so it may be assigned after
        the window has been built.
        """
        if self.callback is None:
            messagebox.showwarning("Aviso", "No hay ninguna acción configurada")
            return
        self.callback(action)

    @staticmethod
    def _add_accept_button(window, read_value, callback):
        """Add an "Aceptar" button that passes ``read_value()`` to *callback*
        and then closes *window*."""
        def accept():
            callback(read_value())
            window.destroy()

        ttk.Button(window, text="Aceptar", command=accept).pack(pady=10)

    def show_list(self, items, fields, title="Listado"):
        """Open a window listing *items*, one row per item.

        Each row joins the values of *fields* with ``" | "``; a field missing
        from an item is shown as ``"Unknown"``.
        """
        mw = tk.Toplevel(self.root)
        mw.title(title)
        listbox = tk.Listbox(mw, width=80, height=20)
        listbox.pack(side="left", fill="both", expand=True)
        scrollbar = tk.Scrollbar(mw)
        scrollbar.pack(side="right", fill="y")
        listbox.config(yscrollcommand=scrollbar.set)
        scrollbar.config(command=listbox.yview)
        for item in items:
            row = " | ".join(str(item.get(field, "Unknown")) for field in fields)
            listbox.insert("end", row)

    def ask_text(self, label, callback):
        """Ask for free text and call *callback* with the entered string."""
        mw = tk.Toplevel(self.root)
        mw.title(label)
        tk.Label(mw, text=label).pack(pady=5)
        entry = ttk.Entry(mw)
        entry.pack(pady=5)
        self._add_accept_button(mw, entry.get, callback)

    def ask_spinbox(self, label, options, callback):
        """Ask for one of *options* with a read-only spinbox and call
        *callback* with the selected value."""
        mw = tk.Toplevel(self.root)
        mw.title(label)
        tk.Label(mw, text=label).pack(pady=5)
        spinbox = ttk.Spinbox(mw, values=options, state="readonly", width=40)
        spinbox.pack(pady=5)
        self._add_accept_button(mw, spinbox.get, callback)

    def ask_radiobutton(self, label, options, callback):
        """Ask for one of *options* with radio buttons and call *callback*
        with the selected value; the first option is preselected."""
        mw = tk.Toplevel(self.root)
        mw.title(label)
        tk.Label(mw, text=label).pack(pady=5)
        # An empty option list used to raise IndexError; start with "" then.
        sv = tk.StringVar(value=options[0] if options else "")
        for option in options:
            tk.Radiobutton(mw, text=option, variable=sv, value=option).pack(anchor="w")
        self._add_accept_button(mw, sv.get, callback)

    def info(self, message):
        """Show *message* in an information dialog."""
        messagebox.showinfo("Información", message)
def main():
init_ssl()
locale.setlocale(locale.LC_TIME, "es_ES.UTF-8")
create_index()
root = Tk()
ui = RecipesUI(root)
def handle_action(action):
match(action):
case "load":
resp = messagebox.askyesno(title="Cargar", message="Quieres cargar todos los datos de nuevo?")
if resp:
recipes_count = persist_recipes()
ui.info(f"Se han indexado {recipes_count} recetas")
case "list_recipes":
ix = open_dir(DATA_DIR, "RecipesIndex")
with ix.searcher() as searcher:
recipes = searcher.search(query.Every(), limit=None)
clear = []
for r in recipes:
d = dict(r)
clear.append(d)
print(clear)
ui.show_list(clear, ["title", "difficulty", "units", "duration"])
# case "search_title_or_intro":
# def search_author(author):
# recipes = [recipe for recipe in dbm.get_all("recipes") if author.lower() in recipe["author"].lower()]
# for r in recipes:
# r["units"] = str(r["units"]) + " personas" if r["units"] is not None else "Unknown personas"
# r["duration"] = parse_duration_inverse(r["duration"])
# ui.show_list(recipes, ["title", "difficulty", "units", "duration", "author"])
# ui.ask_text("Buscar por autor: ", search_author)
# case "search_updated_at":
# def search_date(date):
# d = datetime.strptime(date, "%d/%m/%Y")
# recipes = [recipe for recipe in dbm.get_all("recipes")
# if d > datetime.strptime(recipe["updated_at"], "%Y-%m-%d %H:%M:%S")]
# for r in recipes:
# r["units"] = str(r["units"]) + " personas" if r["units"] is not None else "Unknown personas"
# r["duration"] = parse_duration_inverse(r["duration"])
# ui.show_list(recipes, ["title", "difficulty", "units", "duration", "updated_at"])
# ui.ask_text("Buscar por fecha: ", search_date)
# case "search_features_and_title":
# def search_author(author):
# recipes = [recipe for recipe in dbm.get_all("recipes") if author.lower() in recipe["author"].lower()]
# for r in recipes:
# r["units"] = str(r["units"]) + " personas" if r["units"] is not None else "Unknown personas"
# r["duration"] = parse_duration_inverse(r["duration"])
# ui.show_list(recipes, ["title", "difficulty", "units", "duration", "author"])
# ui.ask_text("Buscar por autor: ", search_author)
ui.callback = handle_action
root.mainloop()
# Script entry point: start the application only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()