# Source code for astrogen_utils

import jellyfish
import difflib
import pandas as pd
import numpy as np
import operator
import re 
import os
from openpyxl import load_workbook
from pathlib import Path

# Location of the names CSV, expressed relative to the directory of this module.
path = Path(__file__).parent / "../../data/external/nombres.csv"
# Read at import time into a DataFrame. The name ``gender_list`` is rebound
# further down in this module to a dict built from the 'nombre' and 'genero'
# columns of this table.
gender_list = pd.read_csv(path)


# COLORS ::::::::::::::::::::::::::::::::::::::::::::::::

class bcolors:
    """
    Get color palette for pretty printing

    This class simply contains a list of predefined ANSI escape
    sequences to be used in the visual analysis of strings and
    publication data. Print one of the codes before the text to
    style and ``ENDC`` afterwards to restore the terminal defaults,
    e.g. ``print(bcolors.FAIL + 'error' + bcolors.ENDC)``.

    """
    # ANSI escape sequences (SGR parameters)
    # See also the colorama package
    HEADER = '\033[95m'     # bright magenta foreground
    OKBLUE = '\033[94m'     # bright blue foreground
    OKCYAN = '\033[96m'     # bright cyan foreground
    OKGREEN = '\033[92m'    # bright green foreground
    WARNING = '\033[93m'    # bright yellow foreground
    FAIL = '\033[91m'       # bright red foreground
    ENDC = '\033[0m'        # reset every attribute
    BOLD = '\033[1m'        # bold / increased intensity
    UNDERLINE = '\033[4m'   # underline
    TST = '\033[31;1m'      # red foreground + bold
    X = '\033[4;95;1m'      # underline + bright magenta + bold


# NAME MATCHING :::::::::::::::::::::::::::::::::::::::

def ds(a, b):
    """
    Get several string metrics between two words.

    This function is used to compare two names or surnames using
    four different metrics in word space: Damerau-Levenshtein
    distance, Jaro similarity and Levenshtein distance (from the
    Jellyfish package) and the SequenceMatcher ratio (from the
    difflib package).

    Note that the four values do not share an orientation: the two
    edit distances are 0 for identical strings and grow with the
    number of edits, whereas the Jaro and SequenceMatcher values are
    similarities in [0, 1] that equal 1 for identical strings.

    Args:
        a (string): one of the strings
        b (string): the other string to compare

    Returns:
        res (array): Numpy array ``[d1, d2, d3, d4]`` holding, in
           order, the Damerau-Levenshtein distance, the Jaro
           similarity, the Levenshtein distance and the
           SequenceMatcher ratio between the two words.

    """
    # Newer jellyfish releases expose the Jaro metric as
    # ``jaro_similarity``; ``jaro_distance`` is the legacy name and is
    # not available in every version, so prefer the new one when present.
    jaro = getattr(jellyfish, 'jaro_similarity', None)
    if jaro is None:
        jaro = jellyfish.jaro_distance

    d1 = jellyfish.damerau_levenshtein_distance(a, b)
    d2 = jaro(a, b)
    d3 = jellyfish.levenshtein_distance(a, b)
    d4 = difflib.SequenceMatcher(None, a, b).ratio()
    return np.array([d1, d2, d3, d4])

def ds1(s1, s2):
    """
    Get the smallest dissimilarity between the words of two strings.

    Both strings are lowercased and split on whitespace. When a
    string has more than one word, the whole lowercased string is
    added as one more candidate. Every candidate of ``s1`` is compared
    with every candidate of ``s2`` using ``1 - ratio`` of
    difflib.SequenceMatcher, and the minimum is returned. This lets
    'Juan Carlos' match 'Carlos' with distance 0.

    Args:
        s1 (string): one of the strings
        s2 (string): the other string to compare

    Returns:
        dm (float): minimum dissimilarity, in [0, 1] (0 for an exact,
           case-insensitive match of some pair of candidates). If
           either string contains no words, the sentinel 99 is
           returned.

    """
    def _candidates(text):
        # individual words, plus the full string for multi-word input
        lowered = text.lower()
        words = lowered.split()
        if len(words) > 1:
            words.append(lowered)
        return words

    left = _candidates(s1)
    right = _candidates(s2)
    return min(
        (1 - difflib.SequenceMatcher(None, p1, p2).ratio()
         for p1 in left for p2 in right),
        default=99,  # nothing to compare: keep the historical sentinel
    )

def ds2(ap1, ap2, nom1, nom2):
    """
    Get a combined distance between two (surname, name) pairs.

    The surnames and the given names are compared separately with
    ``ds1`` and the two partial distances are combined as the
    Euclidean norm ``sqrt(d_surname**2 + d_name**2)``.

    Args:
        ap1 (string): surname of the first person
        ap2 (string): surname of the second person
        nom1 (string): given name(s) of the first person
        nom2 (string): given name(s) of the second person

    Returns:
        names_dist (float): combined distance; 0 when both the
           surnames and the names match according to ``ds1``.

    """
    d_apel = ds1(ap1, ap2)
    d_nomb = ds1(nom1, nom2)
    return np.sqrt(d_apel**2 + d_nomb**2)

def initials(initials, string):
    """
    Check if the initials of two names coincide.

    e.g.:

    initials = 'Juan Carlos'; string='Juan' --> True

    initials = 'Juan Carlos'; string='Juan José' --> False

    initials = 'Juan Carlos'; string='Jacinto' --> True

    Args:
        initials (string): source string for the initials
        string (string): full names

    Returns:
        boo (bool): whether the initials are accepted

    Notes:

    The criteria for the string matching is the following:

    - Both strings are lowercased, dots are treated as spaces and
      the first letter of every word is taken.
    - If exactly one of the two strings has a single word and the
      other has more, only the first initials are compared.
    - Otherwise the two lists of initials must be identical.
    - A string without any word only matches another string without
      any word.

    """
    def _first_letters(text):
        return [word[0] for word in text.lower().replace('.', ' ').split()]

    Li = _first_letters(initials)
    Ln = _first_letters(string)

    # Guard on both lists being non-empty: comparing Li[0] with Ln[0]
    # when one side had no words used to raise IndexError.
    one_is_single = len(Li) == 1 or len(Ln) == 1
    if Li and Ln and one_is_single and len(Li) != len(Ln):
        return Li[0] == Ln[0]
    return Li == Ln

def getinitials(nombre):
    """
    Get the initials of a full name

    e.g.: 'Jose Facundo' --> 'J. F.'

    Args:
        nombre (string): one or more whitespace-separated names

    Returns:
        res (string): the uppercased first letter of every word,
           each followed by a dot, joined by single spaces. An empty
           or blank input gives an empty string.

    """
    return ' '.join(word[0].upper() + '.' for word in nombre.split())

def getinitialscompact(nombre):
    """
    Get the initials of a full name, without dots or spaces

    e.g.: 'Jose Facundo' --> 'JF'

    Args:
        nombre (string): one or more whitespace-separated names

    Returns:
        res (string): the uppercased first letter of every word,
           concatenated. An empty or blank input gives an empty
           string.

    """
    return ''.join(word[0].upper() for word in nombre.split())
 
def pickone(df, au, sift):
    """
    Pick the author in a dataframe that is closest to a given author.

    From the candidate rows of ``df`` (those flagged in ``sift``),
    choose the one whose (surname, name) pair is closest to ``au``
    according to ``ds2`` and return a boolean list that is False
    everywhere except at the chosen row.

    Args:
        df (DataFrame): authors table; the first column is read as
           the surname and the second one as the name(s).
        au (string): reference author as 'surname,name' (exactly one
           comma).
        sift (sequence of bool): one flag per row of ``df``; only
           rows with a true flag are considered.

    Returns:
        sift (list of bool): list of the same length as the input
           ``sift`` with a single True at the selected row.

    Notes:

    On ties the first candidate wins. If no row is flagged in
    ``sift``, position 0 is selected.

    """
    a1, n1 = au.split(',')
    dopt = 99
    ind = 0
    # itertuples gives plain positional access to the first two columns;
    # integer keys on a label-indexed row Series (``row[0]``) are deprecated
    # in pandas.
    for i, row in enumerate(df.itertuples(index=False, name=None)):
        if not sift[i]:
            continue
        d = ds2(a1, row[0], n1, row[1])
        if d < dopt:
            dopt = d
            ind = i
    chosen = [False] * len(sift)
    chosen[ind] = True
    return chosen
 
def similar(a, b):
    """
    Get the similarity ratio between two sequences.

    Args:
        a (string): one of the strings
        b (string): the other string to compare

    Returns:
        r (float): ``ratio()`` of difflib.SequenceMatcher, a value in
           [0, 1] where 1 means the sequences are identical.

    """
    return difflib.SequenceMatcher(None, a, b).ratio()

# DATA GOVERNANCE :::::::::::::::::::::::::::::::::::::::

def fnames(auth, folder, extension, include_path=True):
    """
    Build the file name associated with an author.

    The name has the form ``<Surname>_<INITIALS><extension>``: the
    surname is title-cased with its words joined by underscores and
    the given names are reduced to their uppercased initials,
    e.g. apellido='de la vega', nombre='juan carlos', extension='.txt'
    --> 'De_La_Vega_JC.txt'.

    Args:
        auth (object): author record exposing the string attributes
           ``apellido`` (surname) and ``nombre`` (given names).
        folder (string): directory prepended to the file name when
           ``include_path`` is true.
        extension (string): suffix appended verbatim (include the
           leading dot).
        include_path (bool): whether to prepend ``folder``.

    Returns:
        filen (string): the file name, with or without the folder.

    """
    surname = '_'.join(auth.apellido.title().split())
    given_initials = ''.join(word[0].upper() for word in auth.nombre.split())
    fname = '_'.join([surname, given_initials]) + extension

    if include_path:
        return os.path.join(folder, fname)
    return fname



# TEXT MANIPULATION ::::::::::::::::::::::::::::::::::::::::::::

def clean_text(txt):
    """
    Split a text into lowercase alphabetic words.

    The text is lowercased and every character that is not an
    unaccented letter a-z or one of ``á é í ó ú ñ ü ä ë`` is treated
    as a separator.

    Args:
        txt (string): text to clean

    Returns:
        words (list of string): the words found, in order; an empty
           list if the text contains none.

    """
    # Replacing the unwanted characters by spaces is enough: str.split()
    # already collapses runs of whitespace and ignores leading/trailing ones.
    return re.sub("[^a-záéíóúñüäë]", " ", txt.lower()).split()

def df_to_dict(df, key_column, val_column):
    """
    Convert two columns of a dataframe into a dictionary.

    Args:
        df (DataFrame): source table
        key_column (string): name of the column providing the keys
        val_column (string): name of the column providing the values

    Returns:
        res (dict): mapping from each key to the value on the same
           row. When a key is repeated, the value of its last row
           is kept.

    """
    keys = df[key_column].tolist()
    values = df[val_column].tolist()
    return dict(zip(keys, values))

# Rebind gender_list from the loaded DataFrame to a plain dict mapping each
# value of the 'nombre' column to the 'genero' value on the same row, so the
# functions below can look names up with dict.get().
gender_list = df_to_dict(gender_list, key_column='nombre', val_column='genero')

# GENDER DETECTION :::::::::::::::::::::::::::::::::::::::::::::

def get_gender2(names):
    """
    Guess the gender associated with one or more given names.

    The text is split into words with ``clean_text`` and every word
    is looked up in the module-level ``gender_list`` dictionary.
    Words that are missing from the dictionary, or whose entry is
    not 'm' or 'f', are ignored. Each remaining word gives one vote
    to its gender; when there are several such words the first one
    is worth three votes.

    Args:
        names (string): given name(s) of a person

    Returns:
        gender (string): 'm' or 'f' for the gender with most votes
           ('m' on a tie), or 'a' when no word could be classified.

    """
    looked_up = (gender_list.get(word) for word in clean_text(names))
    # Keep only clear-cut codes. Anything else (unknown names, 'a' entries,
    # missing values in the table) carries no information and must not be
    # used as a key of the vote counter.
    codes = [g for g in looked_up if g in ('m', 'f')]
    if not codes:
        return 'a'

    votes = {'m': 0, 'f': 0}
    several = len(codes) > 1
    for i, g in enumerate(codes):
        # the first recognized name dominates in compound names
        votes[g] += 3 if (several and i == 0) else 1

    return 'f' if votes['f'] > votes['m'] else 'm'

 
 
# XLSX WRITERS :::::::::::::::::::::::::::::::::::::::::::::::::

def append_df_to_excel(filename, df, sheet_name='Sheet1', startrow=None,
                       truncate_sheet=False,
                       **to_excel_kwargs):
    """
    Append a DataFrame [df] to existing Excel file [filename]
    into [sheet_name] Sheet.
    If [filename] doesn't exist, then this function will create it.

    Appending relies on ``pandas.ExcelWriter(..., mode='a',
    if_sheet_exists='overlay')`` with the openpyxl engine, which
    requires pandas >= 1.4.

    Args:
        filename (string): File path (Example: '/path/to/file.xlsx')
        df (DataFrame): DataFrame to save to workbook
        sheet_name (string): Name of sheet which will contain
           DataFrame. (default: 'Sheet1')
        startrow (int): upper left cell row to dump data frame.
           Per default (startrow=None) calculate the last row
           in the existing sheet and write to the next row. If the
           sheet does not exist yet, writing starts at row 0.
        truncate_sheet (bool): truncate (remove and recreate)
           [sheet_name] before writing DataFrame to Excel file. The
           default ``startrow`` is still computed from the sheet
           contents before truncation, so pass ``startrow=0`` to
           rewrite the sheet from the top.
        to_excel_kwargs: arguments which will be passed to
           `DataFrame.to_excel()` [can be a dictionary]. An 'engine'
           entry is honoured only when the file is created; it is
           ignored when appending.

    Returns:
        None

    Usage examples:

    >>> append_df_to_excel('d:/temp/test.xlsx', df)

    >>> append_df_to_excel('d:/temp/test.xlsx', df, header=None, index=False)

    >>> append_df_to_excel('d:/temp/test.xlsx', df, sheet_name='Sheet2',
                           index=False)

    >>> append_df_to_excel('d:/temp/test.xlsx', df, sheet_name='Sheet2',
                           index=False, startrow=25)

    (c) [MaxU](https://stackoverflow.com/users/5741205/maxu?tab=profile)
    """
    # Excel file doesn't exist - saving and exiting
    if not os.path.isfile(filename):
        df.to_excel(
            filename,
            sheet_name=sheet_name,
            startrow=startrow if startrow is not None else 0,
            **to_excel_kwargs)
        return

    # ignore [engine] parameter if it was passed
    to_excel_kwargs.pop('engine', None)

    # In append mode the writer loads the existing workbook by itself, and
    # the context manager saves and closes it even if writing fails.
    # 'overlay' makes to_excel write into the existing sheet instead of
    # raising or creating a renamed copy.
    with pd.ExcelWriter(filename, engine='openpyxl', mode='a',
                        if_sheet_exists='overlay') as writer:
        book = writer.book
        sheet_exists = sheet_name in book.sheetnames

        # get the last row in the existing Excel sheet
        # if it was not specified explicitly
        if startrow is None and sheet_exists:
            startrow = book[sheet_name].max_row

        # truncate sheet
        if truncate_sheet and sheet_exists:
            # index of [sheet_name] sheet
            idx = book.sheetnames.index(sheet_name)
            # remove [sheet_name]
            book.remove(book.worksheets[idx])
            # create an empty sheet [sheet_name] using old index
            fresh_sheet = book.create_sheet(sheet_name, idx)
            # pandas versions that cache the name -> worksheet mapping would
            # otherwise keep pointing at the removed sheet; on versions that
            # rebuild the mapping on access this assignment is a no-op.
            writer.sheets[sheet_name] = fresh_sheet

        if startrow is None:
            startrow = 0

        # write out the new sheet
        df.to_excel(writer, sheet_name=sheet_name, startrow=startrow,
                    **to_excel_kwargs)