parse-timelogs-for-upload/pomodoro_to_harvest.py

229 lines
12 KiB
Python
Raw Normal View History

import pandas as pd
import numpy as np
import glob
import re
2021-05-03 04:04:18 +00:00
import sys
# Import our local settings management.
2021-05-02 22:39:37 +00:00
import settings
# sys.ps1 (the primary prompt string) is only defined when the interpreter is
# interactive, so its presence tells us we are being run from the Python
# shell rather than as a script; treat that as debug mode.
debug = hasattr(sys, 'ps1')
if debug:
    # Used for the intermediate snapshots taken while debugging.
    import copy
# Load the Pomodoro Prompt log into a single DataFrame.  A configured log file
# takes precedence; otherwise every CSV matching the configured path prefix
# is read and the results are stacked with a fresh 0..n-1 index.
if not settings.pomodoro_logfile():
    # For multiple files:
    path = settings.pomodoro_logpath()
    # NOTE: plain string concatenation, so `path` has to end with a path
    # separator (or be a filename prefix) for the pattern to match.
    frames = [
        pd.read_csv(csv_file, header=0)
        for csv_file in glob.glob(path + "*.csv")
    ]
    timelog = pd.concat(frames, ignore_index=True)
else:
    # This works for one file:
    timelog = pd.read_csv(settings.pomodoro_logfile())
if debug:
    # Snapshot of the raw import, kept for inspection in the shell.
    imported = copy.deepcopy(timelog)
2021-05-26 16:48:36 +00:00
# Fully identical rows are discarded before any other processing.
timelog = timelog.drop_duplicates()
if debug:
    nodupes = copy.deepcopy(timelog)

# Dump bad data.  The real solution would be to get rid of the 'Cancel' button
# on the Pomodoro Prompt dialog, but we do not know how to do that.  Rows whose
# work task description is blank arrive as NaN ("not a number"), and nothing
# can be done with an entry that has no description, so drop them at the
# outset.  A blank 'intention' is allowed, so only these three columns are
# required to be present:
required_columns = ['started', 'recorded', 'description']
timelog = timelog.dropna(subset=required_columns).reset_index(drop=True)
if debug:
    dropna = copy.deepcopy(timelog)
# For debugging, keep the untouched values around under "orig_*" names.
for backup_column, source_column in (
    ("orig_desc", "description"),
    ("orig_started", "started"),
    ("orig_recorded", "recorded"),
):
    timelog[backup_column] = timelog[source_column]

# Trim surrounding whitespace from the description before working on it.
timelog['description'] = timelog['description'].str.strip()
2021-04-28 04:16:15 +00:00
# Allow multiple entries to be put into one prompt by splitting with semicolon.
# TODO make this a flag since it's possible to use semicolons without meaning
# to make multiple task entries at once.
pieces = timelog["description"].str.split(";")
# explode() gives each piece its own row (other columns are repeated), and
# reset_index() without drop keeps the pre-explode row label in an "index"
# column.
timelog = timelog.assign(description=pieces).explode("description").reset_index()
if debug:
    mess = copy.deepcopy(timelog)
# Parse both timestamp columns and express them in US Eastern time.
# Unparseable 'started' values are coerced to NaT; 'recorded' values are not.
eastern = "US/Eastern"
started_parsed = pd.to_datetime(timelog["started"], errors='coerce')
recorded_parsed = pd.to_datetime(timelog["recorded"])
timelog["started"] = started_parsed.dt.tz_convert(eastern)
timelog["recorded"] = recorded_parsed.dt.tz_convert(eastern)

# When a latest-recorded value is stored in settings, keep only the rows
# recorded after it.
latest_recorded = settings.pomodoro_latest_recorded()
if latest_recorded:
    cutoff = pd.to_datetime(latest_recorded)
    timelog = timelog[timelog["recorded"] > cutoff]

# Every row starts out as 30 minutes (later divided by 60 to get hours).
timelog["time"] = 30

# A pomodoro started before 3am Eastern time is considered to be a continuation
# of the day before, so we are, effectively, on West Coast time for determining
# the day we want to associate a time entry with. PomodoroPrompt saves as UTC.
pacific_start = timelog["started"].dt.tz_convert("US/Pacific")
timelog["date"] = pacific_start.dt.date
timelog["day_of_week"] = pd.to_datetime(timelog["date"]).dt.day_name()
2021-04-28 02:30:07 +00:00
# If a project has been specified (the description starts with a project name
# followed by a colon and a space, "Project: task"), move the project into its
# own column and keep only the remainder as the description.  Rows without a
# project get None.
description = timelog['description']
has_project = description.str.contains(': ')
# `n` is passed by keyword: it is keyword-only from pandas 2.0 on, and the
# keyword form is accepted by older versions as well.  Splitting once keeps
# any later ": " inside the task text intact.
pieces = description.str.split(': ', n=1)
timelog['project'] = np.where(has_project, pieces.str[0], None)
timelog['description'] = np.where(has_project, pieces.str[1], description)

# Mid-work clean up of description and new project.
timelog['description'] = timelog['description'].str.strip()
timelog['project'] = timelog['project'].str.strip()
# If a multiplier has been provided (an asterisk and an integer at the end of a
# task), then multiply the time by it and remove it from the description.
# The end-of-string anchor ($) guarantees the asterisk we match is the last one
# in the text, so cutting at the start of the match removes exactly the
# multiplier suffix.  `\d+` accepts multi-digit multipliers such as "*12".
multiplier_re = re.compile(r'\*\s*(\d+)$')


def _multiplier_of(text):
    """Return the trailing integer multiplier in *text*, or 1 if there is none."""
    found = multiplier_re.search(text)
    return int(found.group(1)) if found else 1


def _without_multiplier(text):
    """Return *text* with any trailing '*<integer>' suffix removed."""
    found = multiplier_re.search(text)
    return text[:found.start()] if found else text


# A row-by-row apply is used on purpose: an earlier np.where version worked on
# some systems but failed on others, because np.where evaluates both branches
# for every row, including rows that have no multiplier to split off.
multipliers = timelog['description'].apply(_multiplier_of).astype(int)
timelog['description'] = timelog['description'].apply(_without_multiplier)
timelog["time"] = timelog["time"] * multipliers

# Clean up description again, after it has been sliced and diced.
timelog['description'] = timelog['description'].str.strip()
# Mostly historical: helper for breaking compound project-tasks into their
# constituent parts.  Keys are the preferred "Project — Task" spelling; values
# are the alternative spellings that should be replaced by it.
compound_project_tasks = {
    "Drutopia — Contributing back to the community": [
        "Drutopia contrib",
        "Drutopia contributing",
        "Drutopia contributions",
    ],
    "Find It Cambridge — Contributing back to the community": [
        "Find It Contrib",
    ],
    "Find It Cambridge — Planning": [
        "Find It project management",
    ],
    "Internal — Contributing back to the community": [
        "Contrib",
        "Agaric: contrib",
        "Contributing",
        "Agaric contrib",
        "Agaric contributions",
    ],
    "Internal — Network Engagement": [
        "Network Engagement",
        "network engagement",
        "Network engagment",
        "Social media",
        "Network building",
        "Agaric network engagement",
    ],
    "Internal — Content": [
        "Agaric site content",
        "Agaric content",
    ],
    "VHFA — Contributing back to the community": [
        "VHFA contrib",
    ],
}
for preferred, alternatives in compound_project_tasks.items():
    # Matching is case-insensitive: every alternative, plus the preferred name
    # itself, is compared in lower case, while the value written back keeps
    # the preferred capitalization.  A fresh set is built for each entry so
    # the lists inside compound_project_tasks are not modified by this loop.
    accepted = {name.lower() for name in (*alternatives, preferred)}
    timelog.loc[timelog["project"].str.lower().isin(accepted), "project"] = preferred
# If a compound project was specified, break that out into a sub-project (in
# Harvest we use Task, which is really task type, for this).
# NOTE(review): the separator literal was empty in the source as received,
# which cannot split anything meaningfully; ' — ' is taken from the
# "Project — Task" keys of compound_project_tasks.  Confirm it is the
# intended separator.
separator = ' — '
# na=False: rows without a project simply are not compound.
is_compound = timelog['project'].str.contains(separator, regex=False, na=False)
# `n` is passed by keyword because it is keyword-only from pandas 2.0 on.
halves = timelog['project'].str.split(separator, n=1)
timelog['subproject'] = np.where(is_compound, halves.str[1], None)
timelog['project'] = np.where(is_compound, halves.str[0], timelog['project'])
2021-04-28 15:14:39 +00:00
# Replace irregular-but-known project names with ones timetracking tools use.
# Keys are the preferred project names; values are the alternative spellings
# that get replaced by the key.
harvest_project_names = {
    "Boston Modern Orchestra Project": ["BMOP", "BMOP.org"],
    "CRLA.org upgrade": ["CRLA", "CRLA upgrade"],
    "Cockrill Precision Products": ["Cockrill Corp", "Cockrill"],
    "Cultura Continued Support": ["Cultura", "MIT Cultura"],
    "Drutopia": ["Drutopia improvements", "Drutopia overhead"],
    "EC Connect": [
        "eccconectcolorado.org",
        "Denver Econnect",
        "Denver Early Childhood",
        "ECconnect",
        "ECconnectColorado",
    ],
    "Eliot School Site & CRM": ["Eliot", "Eliot School"],
    "encuentro 5 sites": ["Encuentro5", "e5", "Encuentro"],
    "Family & Home": ["Family and Home", "Family home"],
    "Find It Cambridge": ["Find It", "FIC", "Cambridge"],
    "GEO Support": ["GEO", "GEO.coop", "Grassroots Economic Organizing"],
    "Immigrant Navigator": ["IFSI", "Immigrant Family Services"],
    "Internal": ["Agaric", "Agaric internal"],
    "Leads": ["Lead", "Agaric leads", "Lead followups"],
    "Internal: Personal Learning": ["Learning", "Personal learning"],
    "MASS Continuous Improvement": ["MASS Design Group", "MASS", "MASS Design"],
    "Metalwerx Maintenance": ["Metalwerx", "Metalwerx.com"],
    "NICHQ Data Upgrade": ["NICHQ Data"],
    "NICHQ Support": ["NICHQ", "NICHQ maintenance"],
    "NICHQ FL CMS LAN": ["FL CMS LAN", "flcmslan", "NICHQ FLCMSLAN"],
    "OAG - Office of Opportunity and Achievement Gaps Task Force": [
        "Boston Public Schools",
        "BPS",
        "OAG",
    ],
    "Portside": [
        "Portside.org Improvements 2020",
        "portside.org",
        "Portside support",
    ],
    "SCDTDP Collaboratory Data Site System Security": [
        "SCDTDP",
        "NICHQ SCDTDP",
        "NICHQ security",
    ],
    "Project GUTS/TWIG/Making Sense of Models": [
        "Teachers with GUTS",
        "TWIG",
        "GUTS",
        "Project GUTS",
    ],
    "The Propaganda Site": [
        "TPS",
        "Propaganda Site",
        "The Propganda Site",
        "Murat & Clay",
    ],
    "VHFA": ["Vermont Housing Finance Agency", "Vermont", "Vermont Housing"],
    "Vulk redesign": ["Vulk", "Vulk.coop"],
}
# Known projects kept separate from harvest_project_names; rows for these are
# written to their own CSV.  Same layout: preferred name -> alternative
# spellings.
other_project_names = {
    "Near North camp": [
        "Near North Camp",
        "Near North defense",
        "Encampment support",
        "Camp support",
        "NN camp defense",
        "NN camp",
        "NN defense",
        "Near North camp defense",
        "Camp",
        "Near North",
    ],
    "Personal": ["Personal/external", "Personal / external", "External"],
    "Tzedakah": ["Community support"],
    "PWGD": ["People Who Give a Damn", "PWGD Inc"],
    "Workers Defense Alliance": [
        "WDA",
        "Alliance",
        "Twin Cities Workers Defense Alliance",
    ],
}
2021-06-02 00:48:57 +00:00
# One combined lookup of every known project, Harvest-tracked or not.
replacement_project_names = harvest_project_names | other_project_names
for preferred, alternatives in replacement_project_names.items():
    # Matching is case-insensitive: every alternative, plus the preferred name
    # itself, is compared in lower case, while the value written back keeps
    # the preferred capitalization.  A fresh set is built for each entry: the
    # merged dict shares its list objects with the two source dicts, so
    # appending to them would modify those tables as well.
    accepted = {name.lower() for name in (*alternatives, preferred)}
    timelog.loc[timelog["project"].str.lower().isin(accepted), "project"] = preferred
2021-04-28 15:14:39 +00:00
# Replace irregular-but-known subproject ("Task") names with ones timetracking
# tools use.  Development is the default and never specified.
subproject_names = {
    "Contributing back to the community": [
        "contrib",
        "contributing",
        "contributions",
    ],
    "Not billed": [
        "nb",
    ],
    "Planning": [
        "plan",
        "meeting",
        "pm",
        "project management",
    ],
}
for preferred, alternatives in subproject_names.items():
    # Case-insensitive match against the alternatives and the preferred name
    # itself; the preferred capitalization is what gets written back.  A
    # fresh set is built for each entry so the lists inside subproject_names
    # are not modified by this loop.
    accepted = {name.lower() for name in (*alternatives, preferred)}
    timelog.loc[timelog["subproject"].str.lower().isin(accepted), "subproject"] = preferred
2021-06-01 20:43:57 +00:00
# Condense duplicate entries by date, summing the minutes spent, and listing
# the first started and last recorded times for each task.
# The fillna is essential or we drop entries with blank ('None') projects;
# a missing subproject is filled with "Development".
group_keys = [
    "date",
    timelog["project"].fillna(""),
    timelog["subproject"].fillna("Development"),
    "description",
]
totals = {"time": 'sum', "started": 'min', "recorded": 'max'}
tl = timelog.groupby(group_keys).agg(totals).reset_index()

# We're doing the final conversion to Harvest as a separate step because we
# want to factor out all of the above non-Harvest-specific logic.
# `latest` is the newest recorded timestamp and `datest` the newest date (as
# text) among the condensed entries.
latest = tl["recorded"].max()
datest = str(tl["date"].max())
# Split the condensed entries three ways: projects in harvest_project_names,
# projects in other_project_names, and everything else (blank or unrecognized
# projects), which is kept as well so the excluded items can be reviewed.
hrvst = tl[tl["project"].isin(list(harvest_project_names))]
other = tl[tl["project"].isin(list(other_project_names))]
unknown = tl[~tl["project"].isin(list(replacement_project_names))]

# Build the Harvest frame: rename our columns to the Harvest headings ...
harvest_columns = {
    'date': 'Date',
    'project': 'Project',
    'subproject': 'Task',
    'description': 'Notes',
}
harvest = hrvst.rename(columns=harvest_columns)
# ... convert minutes to hours and add the fixed person columns ...
harvest["Hours"] = harvest["time"] / 60
harvest["First name"] = "Benjamin"
harvest["Last name"] = "Melançon"
# ... look up each project's client through the mapping from settings ...
project_client_mapping = settings.harvest_get_projects_clients_map()
harvest["Client"] = harvest["Project"].map(project_client_mapping)
# ... and drop the working columns that are not part of the Harvest output.
harvest = harvest.drop(columns=['started', 'recorded', 'time'])
2021-05-29 08:34:08 +00:00
if debug:
    # Interactive session: compute per-project hour totals for inspection and
    # leave both the CSV files and the stored latest-recorded value alone.
    hrvst_grouped = hrvst.groupby("project")["time"].sum() / 60
    other_grouped = other.groupby("project")["time"].sum() / 60
    unknown_grouped = unknown.groupby("project")["time"].sum() / 60
    print("We do not write to CSV nor update the latest recorded setting when run interactively in the python shell.")
else:
    # Script run: write one CSV per category, named after the newest date,
    # then pass the newest recorded timestamp to settings.
    harvest.to_csv(f'harvest-timesheets-{datest}.csv', index=False)
    other.to_csv(f'personal-other-{datest}.csv', index=False)
    unknown.to_csv(f'unknown-{datest}.csv', index=False)
    settings.pomodoro_latest_recorded(latest)