# parse-timelogs-for-upload/pomodoro_to_harvest.py

import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# Load the time log CSV.
timelog = pd.read_csv("timelog-titled.csv")

# Dump bad data. The real solution here is to get rid of the 'Cancel' button
# on the Pomodoro Prompt dialog; until then we need to drop the rows where the
# work task description is blank. pandas reads an empty CSV field as NaN, and
# we cannot do anything with no data in the description, so drop those rows at
# the outset. Only the description column is checked, so that a blank value in
# some other column does not silently discard a usable entry.
timelog = timelog.dropna(subset=["description"]).reset_index(drop=True)

# For debugging, keep the original description around.
timelog["orig_desc"] = timelog["description"]

# Clean up description before we go to work on it.
timelog["description"] = timelog["description"].str.strip()
# Allow multiple entries to be put into one prompt by splitting on semicolons;
# each piece becomes its own row.
# TODO make this a flag since it's possible to use semicolons without meaning
# to make multiple task entries at once.
timelog["description"] = timelog["description"].str.split(";")
timelog = timelog.explode("description")

# A trailing or doubled semicolon ("a; b;") produces an empty piece. An entry
# with no description is useless, so drop it rather than count time for it.
# reset_index() (without drop) keeps the pre-explode row number in an "index"
# column.
timelog = timelog[timelog["description"].str.strip() != ""].reset_index()

# PomodoroPrompt saves as UTC. utc=True parses every timestamp as UTC (values
# without an explicit offset are treated as UTC), so the conversion below
# cannot fail on tz-naive input.
timelog["started"] = pd.to_datetime(timelog["started"], utc=True).dt.tz_convert("US/Eastern")
timelog["recorded"] = pd.to_datetime(timelog["recorded"], utc=True).dt.tz_convert("US/Eastern")

# Every entry starts out worth 30 minutes.
timelog["time"] = 30

# A pomodoro started before 3am Eastern time is considered to be a continuation
# of the day before, so we are, effectively, on West Coast time for determining
# the day we want to associate a time entry with.
timelog["date"] = timelog["started"].dt.tz_convert("US/Pacific").dt.date
timelog["day_of_week"] = pd.to_datetime(timelog["date"]).dt.day_name()
# If a project has been specified (the task is prefixed with a project name
# followed by a colon and a space, e.g. "project: task"), then put the project
# in its own column and keep only the task text in the description.
has_project = timelog["description"].str.contains(": ", regex=False, na=False)

# Split on the first ": " only, so later colons stay in the description.
# `n` is passed by keyword: pandas 2.0 made every argument of str.split except
# `pat` keyword-only.
project_and_task = timelog["description"].str.split(": ", n=1)

timelog["project"] = np.where(has_project, project_and_task.str[0], None)
timelog["description"] = np.where(
    has_project, project_and_task.str[1], timelog["description"]
)

# Mid-work clean up of description.
timelog["description"] = timelog["description"].str.strip()
# If a multiplier has been provided (an asterisk and an integer at the end of a
# task, e.g. "write report *3" or "write report * 12"), then multiply the time
# by it and remove it from the description.
# Ensure we're splitting on the same asterisk we found: use the end of string
# signifier in the regular expression ($), and split from the right. Anything
# the pattern matches runs to the end of the string, so its asterisk is always
# the last one in the description.
multiplier = pd.to_numeric(
    timelog["description"].str.extract(r"\*\s*(\d+)$", expand=False)
)
has_multiplier = multiplier.notna()

# `n` is passed by keyword: pandas 2.0 made every argument of str.rsplit except
# `pat` keyword-only.
timelog["description"] = np.where(
    has_multiplier,
    timelog["description"].str.rsplit("*", n=1).str[0],
    timelog["description"],
)

# Entries without a multiplier keep their time unchanged (multiplied by 1).
timelog["time"] = timelog["time"] * multiplier.fillna(1).astype(int)
# Clean up description again, after it has been sliced and diced.
timelog["description"] = timelog["description"].str.strip()

# Condense duplicate entries by date, summing the minutes spent, and listing
# the first started and last recorded times for each task.
# dropna=False keeps the entries that have no project: by default groupby
# discards every row whose group key contains a missing value, which would
# silently lose all time logged without a "project: " prefix.
tl = (
    timelog.groupby(["date", "project", "description"], dropna=False)
    .agg({"time": "sum", "started": "min", "recorded": "max"})
    .reset_index()
)