26 changes: 26 additions & 0 deletions src/etls/dgt/README.md
@@ -0,0 +1,26 @@
# Main website

[Main website of the Dirección General de Tributos](https://www.hacienda.gob.es/es-ES/Areas%20Tematicas/Impuestos/Direccion%20General%20de%20Tributos/Paginas/Direccion%20general%20de%20tributos.aspx)


# Doctrine

[DGT doctrine](https://www.hacienda.gob.es/es-ES/Normativa%20y%20doctrina/Doctrina/Paginas/default.aspx)

# Search portal

[Search portal](https://petete.tributos.hacienda.gob.es/consultas/)

# Examples of scraped documents

[Doc1](https://petete.tributos.hacienda.gob.es/consultas/do/document?doc=1&tab=1)
[Doc2](https://petete.tributos.hacienda.gob.es/consultas/do/document?doc=64247&tab=2)
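
Each document is addressed by two query parameters: `doc` (the document id) and `tab` (the search tab it was published under, 1 and 2 in the examples above). For quick manual inspection, one document can be fetched with a request like the minimal sketch below; the headers and the `verify=False` flag mirror the ones used by the scraper in `utils.py` and `scrapper.py`:

```python
# Minimal sketch: fetch one scraped document by hand (Doc1 above).
import requests
from bs4 import BeautifulSoup

url = "https://petete.tributos.hacienda.gob.es/consultas/do/document"
params = {"doc": "1", "tab": "1"}  # values taken from the Doc1 example above
headers = {
    "Referer": "https://petete.tributos.hacienda.gob.es/consultas",
    "X-Requested-With": "XMLHttpRequest",
}

response = requests.get(url, params=params, headers=headers, verify=False)  # the scraper also skips SSL verification
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
print(soup.get_text(separator="\n", strip=True)[:500])  # preview the ruling text
```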

# Dates to scrape

The first general ruling (consulta general) was published on 03/01/1997.
The first binding ruling (consulta vinculante) was published on 29/07/1997.
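
For an initial backfill starting from those dates, the `dates` command defined in `load.py` (added further down in this diff) can be driven directly from Python. A minimal sketch, assuming the package is importable and the vector store is configured; the end date is only an illustrative value:

```python
# Sketch: backfill DGT rulings starting at the first published general ruling.
# Dates use the YYYY/MM/DD format expected by the `dates` command in load.py.
from src.etls.dgt.load import dates

dates(date_start="1997/01/03", date_end="1997/12/31")  # end date chosen only for illustration
```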

# Database update frequency
??

Empty file added src/etls/dgt/__init__.py
Empty file.
1 change: 1 addition & 0 deletions src/etls/dgt/defs.py
@@ -0,0 +1 @@
COLLECTION_NAME = "dgt"
60 changes: 60 additions & 0 deletions src/etls/dgt/load.py
@@ -0,0 +1,60 @@
from datetime import date, datetime

import typer

from src.email.send_email import send_email
from src.etls.dgt.scrapper import DGTScrapper
from src.etls.dgt.defs import COLLECTION_NAME
from src.etls.common.etl import ETL
from src.initialize import initialize_app


app = typer.Typer()


@app.command()
def today(init_objects=None):
if init_objects is None:
init_objects = initialize_app()
etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME])
dgt_scrapper = DGTScrapper()
day = date.today()
docs = dgt_scrapper.download_day(day)
if docs:
etl_job.run(docs)

subject = "[DGT] Daily ETL executed"
content = f"""
Daily ETL executed
- Date: {day}
- Documents loaded: {len(docs)}
- Database used: {init_objects.config_loader['vector_store']}
"""
send_email(init_objects.config_loader, subject, content)


@app.command()
def dates(date_start: str, date_end: str, init_objects=None):
if init_objects is None:
init_objects = initialize_app()
etl_job = ETL(config_loader=init_objects.config_loader, vector_store=init_objects.vector_store[COLLECTION_NAME])
dgt_scrapper = DGTScrapper()
docs = dgt_scrapper.download_days(
date_start=datetime.strptime(date_start, "%Y/%m/%d").date(),
date_end=datetime.strptime(date_end, "%Y/%m/%d").date(),
)
if docs:
etl_job.run(docs)

subject = "[DGT] Load ETL executed"
content = f"""
Load ETL executed
- Date start: {date_start}
- Date end: {date_end}
- Documents loaded: {len(docs)}
- Database used: {init_objects.config_loader['vector_store']}
"""
send_email(init_objects.config_loader, subject, content)

if __name__ == "__main__":
app()
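
Both commands accept an optional `init_objects` argument, so an already-initialized app can be reused instead of calling `initialize_app()` inside each command. A hedged sketch of that pattern, with an illustrative date range:

```python
# Sketch: reuse one initialized app across several ETL runs.
from src.initialize import initialize_app
from src.etls.dgt.load import today, dates

init_objects = initialize_app()
today(init_objects=init_objects)
dates("2024/01/01", "2024/01/31", init_objects=init_objects)  # illustrative date range
```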
36 changes: 36 additions & 0 deletions src/etls/dgt/metadata.py
@@ -0,0 +1,36 @@
from datetime import datetime
from pydantic import field_validator

from src.etls.common.metadata import MetadataDocument


class DGTMetadataDocument(MetadataDocument):
"""Class for keeping metadata of a DGT Document scrapped."""

# Text
filepath: str

# Source
source_name: str = "DGT"
source_type: str

    # Metadata
identificador: str
numero_consulta: str
organo: str
normativa: str = ""
url_html: str
fecha_publicacion: str
fecha_disposicion: str = ""
anio: str
mes: str
dia: str

datetime_insert: str = datetime.utcnow().isoformat()

@field_validator("fecha_publicacion", "fecha_disposicion")
@classmethod
def isoformat(cls, v):
if v:
datetime.strptime(v, "%Y-%m-%d")
return v
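
As an illustration of the metadata model, the sketch below builds one document with hypothetical values; it assumes the `MetadataDocument` base class adds no further required fields, and the validator rejects `fecha_publicacion`/`fecha_disposicion` values that are not `YYYY-MM-DD`:

```python
# Sketch with hypothetical values; non-empty dates must be YYYY-MM-DD or validation raises an error.
from src.etls.dgt.metadata import DGTMetadataDocument

doc = DGTMetadataDocument(
    filepath="/tmp/dgt_example.txt",      # hypothetical temp file holding the ruling text
    source_type="Consulta",               # hypothetical value
    identificador="64247",
    numero_consulta="V0001-24",           # hypothetical consultation number
    organo="SG de Tributos",              # hypothetical issuing body
    url_html="https://petete.tributos.hacienda.gob.es/consultas/do/document?doc=64247&tab=2",
    fecha_publicacion="2024-01-15",
    anio="2024",
    mes="1",
    dia="15",
)
print(doc.source_name, doc.numero_consulta)
```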
180 changes: 180 additions & 0 deletions src/etls/dgt/scrapper.py
@@ -0,0 +1,180 @@
import re
import requests
import tempfile
import typing as tp
import logging as lg
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from datetime import date, datetime
from requests.exceptions import HTTPError

from src.etls.common.scrapper import BaseScrapper
from src.etls.dgt.metadata import DGTMetadataDocument
from src.etls.dgt.utils import SEARCH_POST, DOC_POST, HEADERS, TARGET_CLASSES
from src.initialize import initialize_logging  # assumed to live in src.initialize, alongside initialize_app

initialize_logging()

def _extract_target_class(soup: BeautifulSoup, target_class: str) -> str:
    """
    Extracts the text of the <p> elements inside the <tr> row tagged with the given class.

    :param soup: The BeautifulSoup document to search within.
    :param target_class: The CSS class to look for (e.g. "NUM-CONSULTA").
    :return: The concatenated, stripped text if found, otherwise an empty string.
    """

    extracted_value = ""
    # Extract the requested metadata row and concatenate its paragraphs
    row = soup.find('tr', class_=target_class)
    paragraphs = row.find_all('p', class_=target_class)
    for paragraph in paragraphs:
        extracted_value += paragraph.get_text(separator='\n', strip=True)

    return extracted_value

def _extract_metadata(soup) -> tp.Dict:
metadata_dict = {}

metadata_dict['source_type'] = soup.find('div', class_="doc_header").contents[-1].strip()

    # Metadata
if numero_consulta := _extract_target_class(soup, "NUM-CONSULTA"):
metadata_dict["numero_consulta"] = numero_consulta

if organo := _extract_target_class(soup, "ORGANO"):
metadata_dict["organo"] = organo

if normativa := _extract_target_class(soup, "NORMATIVA"):
metadata_dict["normativa"] = normativa

if fecha_publicacion := _extract_target_class(soup, "FECHA-SALIDA"):
fecha_publicacion = datetime.strptime(fecha_publicacion, "%d/%m/%Y").strftime("%Y-%m-%d")
metadata_dict["fecha_publicacion"] = fecha_publicacion
metadata_dict["fecha_disposicion"] = fecha_publicacion
metadata_dict["anio"] = str(datetime.strptime(fecha_publicacion, "%Y-%m-%d").year)
metadata_dict["mes"] = str(datetime.strptime(fecha_publicacion, "%Y-%m-%d").month)
metadata_dict["dia"] = str(datetime.strptime(fecha_publicacion, "%Y-%m-%d").day)

return metadata_dict

def _extract_text(soup: BeautifulSoup, target_classes: tp.List[str]) -> str:
    """
    Extracts text from HTML elements with specific classes. For each class,
    find the elements with that class and keep only the top-level ones (those
    not nested inside another element of the same class), appending their text
    to the result.
    """
extracted_text = ""
for class_name in target_classes:
for element in soup.find_all(class_=class_name):
parent_with_same_class = element.find_parent(class_=class_name)
if parent_with_same_class is None:
extracted_text += element.get_text(separator='\n', strip=True) + "\n\n"
return extracted_text


def _list_links_day(url: str, day_str: str) -> tp.List[tp.Tuple[str, int]]:
    """Get the list of documents published on a given day in the DGT search portal.

    :param url: base search url. Example: 'https://petete.tributos.hacienda.gob.es/consultas/do/search'
    :param day_str: date to scrape, in dd/mm/YYYY format
    :return: list of (document id, tab) tuples to explore
    """
    logger = lg.getLogger(_list_links_day.__name__)
    logger.info("Scraping day: %s", day_str)

    SEARCH_POST['dateIni_2'] = day_str
    SEARCH_POST['dateEnd_2'] = day_str
    SEARCH_POST['VLCMP_2'] = day_str + '..' + day_str

    extracted_docs = []
    for tab in range(1, 3):
        SEARCH_POST['tab'] = tab
        response = requests.post(url, data=SEARCH_POST, headers=HEADERS, verify=False)  # skip SSL verification
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        if "La consulta realizada no devuelve resultados" in soup.text:
            continue

        # Extract the total number of result pages
        total_pages = int(soup.find('span', id='total_pages').text)

        for page in range(1, total_pages + 1):
            SEARCH_POST['page'] = str(page)
            response = requests.post(url, data=SEARCH_POST, headers=HEADERS, verify=False)  # skip SSL verification
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            # Find all the docs in the response which correspond to published enquiries.
            # Use a regular expression to match 'id' attributes starting with 'doc_'
            doc_ids = soup.find_all('td', id=re.compile('^doc_'))

            # Extract the 'id' attribute from each matching tag
            current_extracted_docs = [(doc['id'].split('_')[1], tab) for doc in doc_ids]
            extracted_docs += current_extracted_docs

logger.info("Scrapped day successfully %s (%s DGT documents)", url, len(extracted_docs))

return extracted_docs

class DGTScrapper(BaseScrapper):
def download_day(self, day: date) -> tp.List[DGTMetadataDocument]:
"""Download all the documents for a specific date."""
logger = lg.getLogger(self.download_day.__name__)
logger.info("Downloading DGT content for day %s", day)
day_str = day.strftime("%d/%m/%Y")
url_search = "https://petete.tributos.hacienda.gob.es/consultas/do/search"
url = "https://petete.tributos.hacienda.gob.es/consultas/do/document"
metadata_documents = []
try:
docs = _list_links_day(url_search, day_str)
for id_doc, tab in docs:
try:
DOC_POST['doc'] = id_doc
DOC_POST['tab'] = tab
encoded_params = urlencode(DOC_POST)
url_document = f"{url}?{encoded_params}"
metadata_doc = self.download_document(url_document)
                    if metadata_doc is not None:
                        metadata_documents.append(metadata_doc)
                    else:
                        logger.info("No data found in document %s", url_document)
                except (HTTPError, AttributeError):
                    logger.error("Could not scrape document %s on day %s", url_document, day_str)
        except HTTPError:
            logger.error("Could not scrape any document on day %s", day_str)
logger.info("Downloaded DGT content for day %s", day_str)
return metadata_documents

    def download_document(self, url: str) -> tp.Optional[DGTMetadataDocument]:
"""Get text and metadata from a DGT document.

:param url: document url link. Examples:
* https://petete.tributos.hacienda.gob.es/consultas/do/document?doc=64316&tab=2
* https://petete.tributos.hacienda.gob.es/consultas/do/document?doc=1&tab=1
:return: document with metadata and filepath with text content
"""
logger = lg.getLogger(self.download_document.__name__)
logger.info("Scrapping document: %s", url)
response = requests.get(url, headers=HEADERS, verify=False)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

extracted_text = _extract_text(soup, TARGET_CLASSES)

# Check if enquiry has content
if "Contestación completa" not in extracted_text:
logger.info("Scrapped document is empty: %s", url)
return

with tempfile.NamedTemporaryFile("w", delete=False) as fn:
fn.write(extracted_text)
metadata_dict = _extract_metadata(soup)
metadata_dict["identificador"] = (url.split("=")[1]).split("&")[0]
metadata_dict["url_html"] = url
metadata_doc = DGTMetadataDocument(filepath=fn.name, **metadata_dict)
logger.info("Scrapped document successfully %s", url)
return metadata_doc
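
For a quick check outside the full ETL, the scraper can be exercised on its own; a minimal sketch, with an illustrative date:

```python
# Sketch: download one day of DGT rulings without running the full ETL.
from datetime import date
from src.etls.dgt.scrapper import DGTScrapper

scrapper = DGTScrapper()
docs = scrapper.download_day(date(2024, 1, 15))  # illustrative date
for doc in docs:
    print(doc.numero_consulta, doc.url_html, doc.filepath)  # ruling text is stored at doc.filepath
```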
65 changes: 65 additions & 0 deletions src/etls/dgt/utils.py
@@ -0,0 +1,65 @@
from datetime import datetime, timedelta

# POST form data used to search DGT documents
SEARCH_POST = {
'type1': 'on',
'type2': 'on',
'NMCMP_1': 'NUM-CONSULTA',
'VLCMP_1': '',
'OPCMP_1': '.Y',
'NMCMP_2': 'FECHA-SALIDA',
'VLCMP_2': '',
'dateIni_2': '',
'dateEnd_2': '',
'OPCMP_2': '.Y',
'NMCMP_3': 'NORMATIVA',
'VLCMP_3': '',
'OPCMP_3': '.Y',
'NMCMP_4': 'CUESTION-PLANTEADA',
'VLCMP_4': '',
'OPCMP_4': '.Y',
'NMCMP_5': 'DESCRIPCION-HECHOS',
'VLCMP_5': '',
'OPCMP_5': '.Y',
'NMCMP_6': 'FreeText',
'VLCMP_6': '',
'OPCMP_6': '.Y',
'NMCMP_7': 'CRITERIO',
'cmpOrder': 'NUM-CONSULTA',
'dirOrder': '0',
'auto': '',
'tab': '2',
'page': '1'
}


# POST parameters used to fetch a single document from the DGT search results
DOC_POST = {
'doc': '', # doc_id
'tab': ''
}

# HTTP headers for the requests
HEADERS = {
'Referer': 'https://petete.tributos.hacienda.gob.es/consultas',
'X-Requested-With': 'XMLHttpRequest',
}

# Target classes for extracting text
# ["NUM-CONSULTA", "ORGANO", "FECHA-SALIDA", "NORMATIVA", "DESCRIPCION-HECHOS",
# "CUESTION-PLANTEADA", "CONTESTACION-COMPL"]
TARGET_CLASSES = ["NORMATIVA", "DESCRIPCION-HECHOS", "CUESTION-PLANTEADA", "CONTESTACION-COMPL"]

def get_previous_month_dates(t_date=None):
    """
    Get the first and last day of the month before t_date (defaults to now).
    """
    if t_date is None:
        t_date = datetime.now()
    last_day_previous_month = datetime(t_date.year, t_date.month, 1) - timedelta(days=1)
    first_day_previous_month = datetime(last_day_previous_month.year, last_day_previous_month.month, 1)

    return first_day_previous_month, last_day_previous_month
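
As a worked example of `get_previous_month_dates`, for a reference date of 2024-03-10 the previous-month window is 2024-02-01 through 2024-02-29 (the helper returns `datetime` objects):

```python
# Sketch: previous-month window for an illustrative reference date.
from datetime import datetime
from src.etls.dgt.utils import get_previous_month_dates

start, end = get_previous_month_dates(datetime(2024, 3, 10))
print(start.date(), end.date())  # 2024-02-01 2024-02-29
```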