From 206530b235f459c63f29c4c9bcd93757567ca75d Mon Sep 17 00:00:00 2001 From: mlncn Date: Sun, 2 May 2021 22:38:54 -0400 Subject: [PATCH] Use .apply rather than np.where for easier/better/not-randomly-failing individual parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On some systems, using np.where worked, but on others it failed. Why it worked is unknown, but it failed because np.where evaluates all parts, even the parts that will never get used because the where clause does not apply! This caused the chained string operations to fail because there was no string to operate on. This worked fine on the System76 and didn't on Bridget's computer, even after updating the version of Python, but anyway, .apply() is better for what I am trying to do here. TODO: convert other np.where uses to .apply. See #4 in https://datatofish.com/if-condition-in-pandas-dataframe/ --- pomodoro_to_harvest.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pomodoro_to_harvest.py b/pomodoro_to_harvest.py index caf6524..a5bb1df 100644 --- a/pomodoro_to_harvest.py +++ b/pomodoro_to_harvest.py @@ -1,5 +1,6 @@ import pandas as pd import numpy as np +import re import settings timelog = pd.read_csv(settings.pomodoro_logfile()) @@ -47,8 +48,15 @@ timelog['project'] = timelog['project'].str.strip() # task), then multiply the time by it and remove it from the description. # Ensure we're splitting on the same asterisk we found: Use the end of string # signifier in the regular expression ($), and split from the right. -timelog['tmp_multiplier'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[1].str.strip(), 1)) -timelog['description'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[0], timelog['description'])) +p = re.compile(r'\*\s*\d$') +# On some systems, using np.where worked but others failed. 
Why it worked is +# unknown, but it failed because np.where evaluates all parts, even +# the parts that will never get used because the where clause does not apply! +# This caused the chained string operations to fail because there was no string. +# timelog['tmp_multiplier'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[1].str.strip(), 1)) +# timelog['description'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[0], timelog['description'])) +timelog['tmp_multiplier'] = timelog['description'].apply(lambda x: x.rsplit('*', 1)[1].strip() if p.search(x) else 1) +timelog['description'] = timelog['description'].apply(lambda x: x.rsplit('*', 1)[0] if p.search(x) else x) timelog["time"] = timelog["time"] * timelog['tmp_multiplier'].astype(int) timelog.drop(columns=['tmp_multiplier'], inplace=True)