parse-timelogs-for-upload/pomodoro_to_harvest.py
mlncn 206530b235 Use .apply rather than np.where for easier/better/not-randomly-failing individual parsing
+# On some systems, using np.where worked but others failed.  Why it worked is
+# unknown but why it failed is because numpy where evaluates all parts, even
+# the parts that will never get used because the where clause does not apply!
+# This caused the chained strings to fail because— no string.

This worked fine on the System76 but not on Bridget's computer, even after
updating the version of Python. In any case, .apply() is a better fit for what
I am trying to do here.

TODO convert other np.where uses to .apply

See #4 in https://datatofish.com/if-condition-in-pandas-dataframe/
2021-05-02 22:38:54 -04:00

94 lines
5.2 KiB
Python

import pandas as pd
import numpy as np
import re
import settings
# Load the Pomodoro log and discard unusable rows in one pass.
#
# Cancelling the Pomodoro Prompt dialog leaves a row whose description comes
# through as a missing value (NaN). Nothing can be done with an entry that has
# no description, so such rows are dropped up front. Note that dropna() removes
# a row when *any* of its columns is missing, not only the description.
# The index is then renumbered so it is contiguous again.
timelog = (
    pd.read_csv(settings.pomodoro_logfile())
    .dropna()
    .reset_index(drop=True)
)
# Keep an untouched copy of the description for debugging.
timelog["orig_desc"] = timelog["description"]
# Trim surrounding whitespace before any parsing happens.
timelog["description"] = timelog["description"].str.strip()
# One prompt may hold several tasks separated by semicolons: turn each
# description into a list of fragments and give every fragment its own row.
# TODO make this a flag, since a semicolon can be typed without meaning to
# record multiple task entries at once.
fragments = timelog["description"].str.split(";")
# reset_index() (without drop) keeps the pre-explode row number in an "index"
# column, so fragments of the same prompt can still be traced back to it.
timelog = (
    timelog.assign(description=fragments)
    .explode("description")
    .reset_index()
)
# Parse both timestamp columns and express them in US/Eastern time.
for column in ("started", "recorded"):
    parsed = pd.to_datetime(timelog[column])
    timelog[column] = parsed.dt.tz_convert("US/Eastern")
# Every log row starts out worth 30 (minutes).
timelog["time"] = 30
# PomodoroPrompt saves timestamps as UTC. A pomodoro started before 3am
# Eastern counts as a continuation of the previous day, so the calendar date
# is taken from the start time as seen in US/Pacific.
started_pacific = timelog["started"].dt.tz_convert("US/Pacific")
timelog["date"] = started_pacific.dt.date
timelog["day_of_week"] = pd.to_datetime(timelog["date"]).dt.day_name()
# If a project has been specified (the task is prefixed with the project name
# followed by a colon and a space), put the project in its own column and keep
# only the remainder as the description.
#
# This is done per row with .apply rather than with np.where and the pandas
# .str accessor: passing the split limit positionally, as in
# .str.split(': ', 1), is rejected by pandas 2.0 and later, where `n` is
# keyword-only. Plain Python string methods behave the same everywhere.
def _split_project(text):
    """Split one task description into ``(project, description)``.

    The text is split on the first ``': '``. When that separator is absent
    the project is ``None`` and the whole text is the description. Both
    parts are returned with surrounding whitespace stripped.
    """
    project, separator, task = text.partition(': ')
    if not separator:
        return None, text.strip()
    return project.strip(), task.strip()


# Derive the project first: it must be read from the still-unsplit description.
timelog['project'] = timelog['description'].apply(lambda x: _split_project(x)[0])
timelog['description'] = timelog['description'].apply(lambda x: _split_project(x)[1])
# If a multiplier has been provided (an asterisk and an integer at the end of a
# task), then multiply the time by it and remove it from the description.
# Ensure we're splitting on the same asterisk we found: Use the end of string
# signifier in the regular expression ($), and split from the right.
# The integer may have any number of digits (\d+), so "task *12" is recognized
# just like "task * 3".
p = re.compile(r'\*\s*\d+$')
# Use .apply rather than np.where here: np.where evaluates both branches for
# every row, even rows the condition excludes, and the chained .str calls of
# the earlier np.where version failed on some systems because of that.
multiplier = timelog['description'].apply(
    lambda x: int(x.rsplit('*', 1)[1]) if p.search(x) else 1
)
timelog['description'] = timelog['description'].apply(
    lambda x: x.rsplit('*', 1)[0] if p.search(x) else x
)
# astype(int) guarantees an integer dtype for the multiplication.
timelog["time"] = timelog["time"] * multiplier.astype(int)
# Clean up description again, after it has been sliced and diced.
timelog['description'] = timelog['description'].str.strip()
# Map each project name the timetracking tools expect to the irregular (but
# known) spellings that may appear in the log.
replacement_project_names = {
    "Find It Cambridge": ["Find It", "FIC", "Cambridge"],
    "The Propaganda Site": ["TPS", "Propaganda Site"],
    "MASS Design Group": ["MASS"],
    "Teachers with GUTS": ["TWIG", "GUTS"],
    "NICHQ Support": ["NICHQ", "NICHQ support", "nichq"],
    "Network engagement": [
        "Network Engagement",
        "network engagement",
        "Network engagment",
        "Social media",
        "Network building",
        "Agaric network engagement",
    ],
    "Agaric internal": ["Agaric", "Internal"],
    "Agaric contrib": ["Contributing", "Contrib"],
    "Leads": ["Lead", "Agaric leads", "Lead followups"],
    "Learning": ["Personal learning"],
    "Drutopia": ["Drutopia improvements"],
    "Personal / external": ["Personal/external", "Personal", "External"],
    "Near North camp": [
        "Near North Camp",
        "Near North defense",
        "Encampment support",
        "Camp support",
    ],
}
# Rewrite every row whose project exactly matches one of the alternative
# spellings so that it carries the preferred name instead.
# TODO Probably put all alternatives in lower case and do str.lower() on
# project just before the "is in" check.
for canonical, aliases in replacement_project_names.items():
    uses_alias = timelog["project"].isin(aliases)
    timelog.loc[uses_alias, "project"] = canonical
# Collapse repeated entries: one row per (date, project, description), with
# the minutes summed and the earliest start / latest recorded time kept.
# Rows without a project must be grouped under "" -- without the fillna the
# groupby would drop entries whose project is blank ('None').
group_keys = ["date", timelog["project"].fillna(""), "description"]
aggregations = {"time": "sum", "started": "min", "recorded": "max"}
tl = timelog.groupby(group_keys).agg(aggregations).reset_index()
# Write the condensed log out for upload.
tl.to_csv("harvest-ready.csv", index=False)
# Hand the most recent "recorded" timestamp back to settings.
latest_recorded = tl["recorded"].max()
settings.pomodoro_latest_recorded(latest_recorded)