2021-04-27 13:41:44 +00:00
|
|
|
import pandas as pd
|
2021-04-28 01:45:37 +00:00
|
|
|
import numpy as np
|
2021-05-25 19:57:52 +00:00
|
|
|
import glob
|
2021-05-03 02:38:54 +00:00
|
|
|
import re
|
2021-05-03 04:04:18 +00:00
|
|
|
import sys
|
|
|
|
# Import our local settings management.
|
2021-05-02 22:39:37 +00:00
|
|
|
import settings
|
2021-04-27 13:41:44 +00:00
|
|
|
|
2021-05-25 19:57:52 +00:00
|
|
|
# Load the Pomodoro Prompt logs into a single DataFrame named `timelog`.
#
# Reading one file would be enough for a single log:
# timelog = pd.read_csv(settings.pomodoro_logfile())
#
# For multiple files, read every CSV in the configured log directory.
path = settings.pomodoro_logpath()
all_files = glob.glob(path + "/*.csv")

if not all_files:
    # pd.concat() on an empty list fails with the opaque message
    # "No objects to concatenate"; say what is actually wrong instead.
    raise ValueError("No CSV log files found in {}".format(path))

# One DataFrame per file; the first row of each file is its header.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

timelog = pd.concat(li, axis=0, ignore_index=True)

# Remove rows that are exact duplicates of an earlier row (presumably the same
# entry can appear in more than one log file -- confirm). Note the method is
# `drop_duplicates`, and it must not be combined with `inplace=True` when the
# result is assigned, because in-place calls return None.
timelog = timelog.drop_duplicates()
|
2021-04-27 13:41:44 +00:00
|
|
|
|
2021-04-28 03:37:37 +00:00
|
|
|
# Discard unusable rows up front. Pressing 'Cancel' on the Pomodoro Prompt
# dialog leaves the work task description blank, and pandas reads that blank
# as NaN. The proper fix would be removing the 'Cancel' button, but until
# someone works out how, rows with missing values are dropped here because
# nothing useful can be done with an entry that has no description.
#
# dropna() removes every row containing a missing value in any column; the
# index is then renumbered from zero, discarding the old labels.
timelog = timelog.dropna().reset_index(drop=True)
|
|
|
|
|
2021-04-28 04:14:48 +00:00
|
|
|
# Preserve the untouched description in its own column as a debugging aid.
timelog["orig_desc"] = timelog["description"]

# Trim surrounding whitespace, then break each description on semicolons so
# that several tasks typed into a single prompt become separate entries.
# TODO make this a flag since it's possible to use semicolons without meaning
# to make multiple task entries at once.
timelog["description"] = list(
    timelog["description"].str.strip().str.split(";")
)

# One row per task. reset_index() (without drop) renumbers the rows and keeps
# the pre-explode row label in an "index" column.
timelog = timelog.explode("description").reset_index()
|
|
|
|
|
2021-04-27 15:30:53 +00:00
|
|
|
# Parse both timestamp columns and express them in US/Eastern time.
# utc=True makes the parse robust: values that already carry an offset are
# converted to UTC, and values without one are interpreted as UTC (the logs
# are expected to be written in UTC -- confirm against PomodoroPrompt).
# Without it, tz-naive or mixed-offset input makes .dt.tz_convert() fail.
timelog["started"] = pd.to_datetime(timelog["started"], utc=True).dt.tz_convert("US/Eastern")
timelog["recorded"] = pd.to_datetime(timelog["recorded"], utc=True).dt.tz_convert("US/Eastern")

# Only keep entries recorded after the newest entry handled by a previous run,
# if such a value has been saved in the settings.
latest_recorded = settings.pomodoro_latest_recorded()
if latest_recorded:
    # Parse with utc=True so the cutoff is always timezone-aware and therefore
    # comparable with the timezone-aware "recorded" column. Take a copy so the
    # column assignments that follow operate on an independent frame rather
    # than on a slice of the unfiltered one (avoids SettingWithCopyWarning).
    cutoff = pd.to_datetime(latest_recorded, utc=True)
    timelog = timelog[timelog.recorded > cutoff].copy()
|
2021-05-03 04:01:59 +00:00
|
|
|
|
2021-04-27 15:06:20 +00:00
|
|
|
# Every logged entry starts out as 30 units of time (minutes, going by the
# "summing the minutes" note further down the script).
timelog = timelog.assign(time=30)

# A pomodoro started before 3am Eastern time is considered to be a continuation
# of the day before, so we are, effectively, on West Coast time for determining
# the day we want to associate a time entry with. PomodoroPrompt saves as UTC.
started_pacific = timelog["started"].dt.tz_convert("US/Pacific")
timelog["date"] = started_pacific.dt.date

# Weekday name (e.g. "Monday") of the date computed above.
timelog["day_of_week"] = pd.to_datetime(timelog["date"]).dt.day_name()
|
2021-04-27 15:32:34 +00:00
|
|
|
|
2021-04-28 02:30:07 +00:00
|
|
|
# If a project has been specified (the task is prefixed with a project name
# followed by a colon and a space, "Project: task"), move the project into its
# own column and keep only the task text in the description. Rows without a
# project get None.
has_project = timelog['description'].str.contains(': ', regex=False)

# Split on the first ': ' only, so later colons stay in the task text.
# `n` must be passed by keyword: recent pandas versions no longer accept it
# positionally.
project_and_task = timelog['description'].str.split(': ', n=1)

timelog['project'] = np.where(has_project, project_and_task.str[0], None)
timelog['description'] = np.where(has_project, project_and_task.str[1], timelog['description'])

# Mid-work clean up of description and new project.
timelog['description'] = timelog['description'].str.strip()
timelog['project'] = timelog['project'].str.strip()
|
2021-04-28 02:40:15 +00:00
|
|
|
|
2021-04-28 02:30:38 +00:00
|
|
|
# If a multiplier has been provided (an asterisk and an integer at the end of a
# task, e.g. "Code review * 3"), then multiply the time by it and remove it
# from the description.
#
# Ensure we're splitting on the same asterisk we found: the pattern is anchored
# to the end of the string ($) and the split is done from the right. `\d+`
# (rather than a single `\d`) lets multi-digit multipliers such as "* 12" be
# recognized too.
p = re.compile(r'\*\s*\d+$')


def _multiplier(description):
    """Return the trailing "* N" multiplier of a description, or 1 if absent."""
    if p.search(description):
        # int() tolerates the whitespace allowed between '*' and the digits.
        return int(description.rsplit('*', 1)[1])
    return 1


def _without_multiplier(description):
    """Return the description with any trailing "* N" multiplier removed."""
    if p.search(description):
        return description.rsplit('*', 1)[0]
    return description


# This is done row by row with apply() on purpose. An earlier np.where()
# version worked on some systems and failed on others: np.where evaluates both
# branches for every row, including rows the condition excludes, and the
# chained string operations then failed on rows with nothing to split off.
timelog["time"] = timelog["time"] * timelog['description'].apply(_multiplier).astype(int)
timelog['description'] = timelog['description'].apply(_without_multiplier)

# Clean up description again, after it has been sliced and diced.
timelog['description'] = timelog['description'].str.strip()
|
2021-04-28 01:45:37 +00:00
|
|
|
|
2021-04-28 15:14:39 +00:00
|
|
|
# Known irregular spellings of project names, keyed by the name the
# timetracking tools use. Each key is the preferred project name; its list
# holds the alternative spellings that should be rewritten to that key.
replacement_project_names = {
    "Find It Cambridge": [
        "Find It",
        "FIC",
        "Cambridge",
    ],
    "The Propaganda Site": [
        "TPS",
        "Propaganda Site",
    ],
    "MASS Design Group": [
        "MASS",
    ],
    "Teachers with GUTS": [
        "TWIG",
        "GUTS",
    ],
    "NICHQ Support": [
        "NICHQ",
        "NICHQ support",
        "nichq",
    ],
    "Network engagement": [
        "Network Engagement",
        "network engagement",
        "Network engagment",
        "Social media",
        "Network building",
        "Agaric network engagement",
    ],
    "Agaric internal": [
        "Agaric",
        "Internal",
    ],
    "Agaric contrib": [
        "Contributing",
        "Contrib",
    ],
    "Leads": [
        "Lead",
        "Agaric leads",
        "Lead followups",
    ],
    "Learning": [
        "Personal learning",
    ],
    "Drutopia": [
        "Drutopia improvements",
    ],
    "Personal / external": [
        "Personal/external",
        "Personal",
        "External",
    ],
    "Near North camp": [
        "Near North Camp",
        "Near North defense",
        "Encampment support",
        "Camp support",
    ],
}
|
|
|
|
# Rewrite every known alternative spelling of a project to its preferred name.
# Matching is exact and case-sensitive.
# TODO Probably put all alternatives in lower case and do str.lower() on
# project just before the "is in" check.
for canonical_name, known_aliases in replacement_project_names.items():
    uses_alias = timelog["project"].isin(known_aliases)
    timelog.loc[uses_alias, "project"] = canonical_name
|
|
|
|
|
2021-04-27 23:48:46 +00:00
|
|
|
# Collapse entries that share a date, project and description into one row:
# the time is summed, and the earliest start and the latest recorded time of
# the group are kept.
#
# Grouping on project.fillna("") instead of the raw column is essential:
# otherwise entries whose project is blank (None) would be dropped.
group_keys = ["date", timelog["project"].fillna(""), "description"]
aggregations = {"time": "sum", "started": "min", "recorded": "max"}
tl = timelog.groupby(group_keys).agg(aggregations).reset_index()
|
|
|
|
|
2021-05-03 04:13:07 +00:00
|
|
|
# sys.ps1 only exists when Python is running interactively, so its absence
# means this is a normal script run: write the results and remember how far
# we got. In the interactive shell nothing is written or updated.
if not hasattr(sys, 'ps1'):
    tl.to_csv('harvest-ready.csv', index=False)
    # Only move the "latest recorded" marker forward when there is something
    # to record. With no rows, tl.recorded.max() is NaT, and saving that would
    # replace the marker from the previous run with a non-timestamp.
    if not tl.empty:
        settings.pomodoro_latest_recorded(tl.recorded.max())
else:
    print("We do not write to the harvest-ready.csv nor update the latest recorded setting when run interactively in the python shell.")
|