Use .apply rather than np.where for easier/better/not-randomly-failing individual parsing

On some systems, using np.where worked, but on others it failed. Why it worked is unknown, but why it failed is that np.where evaluates all of its arguments, even the parts that will never get used because the where clause does not apply! This caused the chained string operations to fail because there was no string. This worked fine on the System76 and didn't on Bridget's computer, even after updating the version of Python; in any case, .apply() is better for what I am trying to do here. TODO: convert other np.where uses to .apply. See #4 in https://datatofish.com/if-condition-in-pandas-dataframe/
2021-05-02 22:38:54 -04:00 · 2021-05-02 22:38:54 -04:00 · 206530b235
commit 206530b235
parent d8a5e27a97
1 changed files with 10 additions and 2 deletions
--- a/pomodoro_to_harvest.py
+++ b/pomodoro_to_harvest.py
@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
+import re
 import settings

 timelog = pd.read_csv(settings.pomodoro_logfile())
@ -47,8 +48,15 @@ timelog['project'] = timelog['project'].str.strip()
 # task), then multiply the time by it and remove it from the description.
 # Ensure we're splitting on the same asterisk we found:  Use the end of string
 # signifier in the regular expression ($), and split from the right.
-timelog['tmp_multiplier'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[1].str.strip(), 1))
-timelog['description'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[0], timelog['description']))
+p = re.compile(r'\*\s*\d$')
+# On some systems, using np.where worked but others failed.  Why it worked is
+# unknown but why it failed is because numpy where evaluates all parts, even
+# the parts that will never get used because the where clause does not apply!
+# This caused the chained strings to fail because— no string.
+# timelog['tmp_multiplier'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[1].str.strip(), 1))
+# timelog['description'] = (np.where(timelog['description'].str.contains('\*\s*\d$'), timelog['description'].str.rsplit('*', 1).str[0], timelog['description']))
+timelog['tmp_multiplier'] = timelog['description'].apply(lambda x: x.rsplit('*', 1)[1].strip() if p.search(x) else 1)
+timelog['description'] = timelog['description'].apply(lambda x: x.rsplit('*', 1)[0] if p.search(x) else x)
 timelog["time"] = timelog["time"] * timelog['tmp_multiplier'].astype(int)
 timelog.drop(columns=['tmp_multiplier'], inplace=True)