From 206530b235f459c63f29c4c9bcd93757567ca75d Mon Sep 17 00:00:00 2001
From: mlncn <ben@agaric.com>
Date: Sun, 2 May 2021 22:38:54 -0400
Subject: [PATCH] Use .apply rather than np.where for
 easier/better/not-randomly-failing individual parsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

+# On some systems np.where worked, but on others it failed.  Why it worked is
+# unknown; it failed because both value arguments to np.where are evaluated in
+# full for every row, even rows where the condition is False.  The chained
+# .str calls then failed because there was no string to operate on.

The np.where version worked fine on the System76 but not on Bridget's
computer, even after updating the version of Python.  In any case, .apply()
is better suited to what I am trying to do here.

TODO: Convert the other np.where uses to .apply().

See #4 in https://datatofish.com/if-condition-in-pandas-dataframe/
---
 pomodoro_to_harvest.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/pomodoro_to_harvest.py b/pomodoro_to_harvest.py
index caf6524..a5bb1df 100644
--- a/pomodoro_to_harvest.py
+++ b/pomodoro_to_harvest.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
+import re
 import settings
 
 timelog = pd.read_csv(settings.pomodoro_logfile())
@@ -47,8 +48,15 @@ timelog['project'] = timelog['project'].str.strip()
 # task), then multiply the time by it and remove it from the description.
 # Ensure we're splitting on the same asterisk we found:  Use the end of string
 # signifier in the regular expression ($), and split from the right.
-timelog['tmp_multiplier'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[1].str.strip(), 1))
-timelog['description'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[0], timelog['description']))
+p = re.compile(r'\*\s*\d$')
+# On some systems np.where worked, but on others it failed.  Why it worked is
+# unknown; it failed because both value arguments to np.where are evaluated in
+# full for every row, even rows where the condition is False.  The chained
+# .str calls then failed because there was no string to operate on.
+# timelog['tmp_multiplier'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[1].str.strip(), 1))
+# timelog['description'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[0], timelog['description']))
+timelog['tmp_multiplier'] = timelog['description'].apply(lambda x: x.rsplit('*', 1)[1].strip() if p.search(x) else 1)
+timelog['description'] = timelog['description'].apply(lambda x: x.rsplit('*', 1)[0] if p.search(x) else x)
 timelog["time"] = timelog["time"] * timelog['tmp_multiplier'].astype(int)
 timelog.drop(columns=['tmp_multiplier'], inplace=True)