benjamin melançon
86cf06edd2
Not spite for Everyday AI, which gets EA, but the grift of artificial information out in the world today
289 lines
17 KiB
Python
289 lines
17 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
import glob
|
|
import re
|
|
import sys
|
|
# Import our local settings management.
|
|
import settings
|
|
|
|
# Interactive interpreters define sys.ps1.  When it is present we run in
# "debug" mode: the copy module is loaded so intermediate snapshots of the
# data can be kept for inspection.
debug = hasattr(sys, 'ps1')
if debug:
    import copy
# Load the Pomodoro log.  A single configured log file wins; otherwise every
# CSV whose name starts with the configured path is read and stacked into one
# frame with a fresh 0..n-1 index.
if not settings.pomodoro_logfile():
    # For multiple files:
    path = settings.pomodoro_logpath()
    all_files = glob.glob(path + "*.csv")
    frames = [
        pd.read_csv(filename, index_col=None, header=0)
        for filename in all_files
    ]
    timelog = pd.concat(frames, axis=0, ignore_index=True)
else:
    # This works for one file:
    timelog = pd.read_csv(settings.pomodoro_logfile())
# In debug mode a deep copy of the frame is kept after each cleaning stage
# (imported -> nodupes -> dropna) so the stages can be compared by hand.
if debug:
    imported = copy.deepcopy(timelog)

# Identical rows are collapsed to one.
timelog = timelog.drop_duplicates()
if debug:
    nodupes = copy.deepcopy(timelog)

# Dump bad data.  The real solution would be to remove the 'Cancel' button
# from the Pomodoro Prompt dialog; until someone works out how, rows whose
# work task description is blank have to go.  A blank description arrives as
# NaN (the original author was unsure why, possibly because it is the last row
# of the spreadsheet) and nothing useful can be done with such a row.
# An empty 'intention' is acceptable, so only these three columns are checked.
required_columns = ['started', 'recorded', 'description']
timelog = timelog.dropna(subset=required_columns).reset_index(drop=True)
if debug:
    dropna = copy.deepcopy(timelog)
# For debugging, keep the untouched values around next to the columns that
# are rewritten further down.
timelog["orig_desc"] = timelog["description"]
timelog["orig_started"] = timelog["started"]
timelog["orig_recorded"] = timelog["recorded"]

# Clean up description before we go to work on it.
timelog["description"] = timelog["description"].str.strip()

# Allow multiple entries to be put into one prompt by splitting on semicolon:
# each fragment becomes its own row, and reset_index() keeps the pre-split row
# number in an "index" column.
# TODO make this a flag since it's possible to use semicolons without meaning
# to make multiple task entries at once.
timelog["description"] = timelog["description"].str.split(";")
timelog = timelog.explode("description").reset_index()

# A trailing or doubled semicolon ("task;" / "a;;b") produces an empty
# fragment.  Left in place it would be counted as a separate entry with no
# description, so blank fragments are dropped here.  The fragments themselves
# are not modified; later steps do their own trimming.
is_blank = timelog["description"].str.strip() == ""
timelog = timelog.loc[~is_blank].reset_index(drop=True)

if debug:
    mess = copy.deepcopy(timelog)
# Parse the timestamps and express them in Eastern time.  PomodoroPrompt saves
# as UTC; utc=True makes the parse always yield timezone-aware UTC values
# (naive stamps are taken as UTC, aware ones are converted), which is what
# .dt.tz_convert() requires.  Unparseable 'started' values become NaT;
# unparseable 'recorded' values still raise.
timelog["started"] = pd.to_datetime(timelog["started"], errors='coerce', utc=True).dt.tz_convert("America/New_York")
timelog["recorded"] = pd.to_datetime(timelog["recorded"], utc=True).dt.tz_convert("America/New_York")

# Only keep entries recorded after the last successful export, if one has
# been stored in the settings.
latest_recorded = settings.pomodoro_latest_recorded()
if latest_recorded:
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not trigger pandas' SettingWithCopyWarning.
    timelog = timelog[timelog.recorded > pd.to_datetime(latest_recorded)].copy()

if timelog.empty and not debug:
    # With no rows there is no latest date to name the output files after
    # (NaT cannot be formatted), so stop here instead of failing later.
    print("No pomodoro entries to process; no CSV files written.")
    sys.exit(0)

# Every logged entry counts as 30 minutes.
timelog["time"] = 30

# A pomodoro started before 3am Eastern time is considered to be a
# continuation of the day before, so we are, effectively, on West Coast time
# for determining the day we want to associate a time entry with.
timelog["date"] = pd.to_datetime(timelog["started"].dt.tz_convert("America/Los_Angeles").dt.date)
timelog["day_of_week"] = timelog["date"].dt.day_name()
# A description of the form "Project: task" names its project.  Everything
# before the first ': ' goes into a 'project' column (None when there is no
# such prefix) and the remainder stays as the description.
has_project = timelog['description'].str.contains(': ')
prefix_and_task = timelog['description'].str.split(': ', n=1)
timelog['project'] = np.where(has_project, prefix_and_task.str[0], None)
timelog['description'] = np.where(has_project, prefix_and_task.str[1], timelog['description'])

# Mid-work clean up of description and new project.
for column in ('description', 'project'):
    timelog[column] = timelog[column].str.strip()
# A description may start with a parenthesized relative date, for example
# "(yesterday) fixed the build".  Capture the text inside that leading pair
# of parentheses; rows without one get NaN.
timeshift_pattern = r'^\((.+?)\)'
timelog['tmp_timeshift'] = timelog['description'].str.extract(timeshift_pattern, expand=False).str.strip()

# In an ideal world we would use https://github.com/bear/parsedatetime or
# similar and even better figure out the right date for strings like "Monday"
# but eh this'll do.
timeshift_days = {
    -1: ['one day ago', '1 day ago', 'yesterday'],
    -2: ['two days ago', '2 days ago', 'day before yesterday', 'the day before yesterday'],
    -3: ['three days ago', '3 days ago'],
    -4: ['four days ago', '4 days ago'],
    -5: ['five days ago', '5 days ago'],
}

# Translate recognized phrases (compared case-insensitively) into a day
# offset; anything else keeps an offset of 0.
timelog['tmp_daysdelta'] = 0
for days, phrases in timeshift_days.items():
    # The bare number, e.g. "(-2)", is accepted too.  Build a new list rather
    # than appending to the table so the table itself is left untouched.
    accepted = [*phrases, str(days)]
    timelog.loc[timelog['tmp_timeshift'].str.lower().isin(accepted), 'tmp_daysdelta'] = days
timelog['date'] = timelog['date'] + pd.to_timedelta(timelog['tmp_daysdelta'], unit='D')

# Remove the prefix from the description only when it was a recognized date
# phrase, and only the leading occurrence.  An unrelated leading parenthetical
# such as "(WIP)" is part of the task text and stays.
shifted = timelog['tmp_daysdelta'] != 0
timelog.loc[shifted, 'description'] = timelog.loc[shifted, 'description'].str.replace(timeshift_pattern, '', n=1, regex=True)

# Keep the helper column in its parenthesized form ("" when absent).
timelog['tmp_timeshift'] = ('(' + timelog['tmp_timeshift'] + ')').fillna("")

# Be sure to trim any whitespace before we regex for ending on asterisk number.
timelog['description'] = timelog['description'].str.strip()
# A task may end in a multiplier: an asterisk followed by an integer, as in
# "long meeting *3".  The time is multiplied by that integer and the suffix
# is cut from the description.  The pattern is anchored to the end of the
# string ($) so the asterisk that is cut is the same one that was matched.
multiplier_suffix = re.compile(r'\*\s*(\d+)$')


def _split_multiplier(text):
    """Return (description without multiplier suffix, multiplier or 1)."""
    found = multiplier_suffix.search(text)
    if found is None:
        return text, 1
    return text[:found.start()], int(found.group(1))


# A per-row function is used instead of np.where on purpose: np.where
# evaluates both alternatives for every row, including rows the condition
# excludes, and the string slicing then fails on rows with no multiplier.
parsed = timelog['description'].map(_split_multiplier)
timelog["time"] = timelog["time"] * parsed.str[1].astype(int)

# Clean up description again, after it has been sliced and diced.
timelog['description'] = parsed.str[0].str.strip()
# Shorthand project names (the list on the right) that stand for a specific
# "Project — Task" combination (the key on the left).  The loop that consumes
# this table compares names case-insensitively.
compound_project_tasks = {
    "Drutopia — Contributing back to the community": [
        "Drutopia contrib", "Drutopia contributing", "Drutopia contributions",
    ],
    "Find It Cambridge — Contributing back to the community": ["Find It Contrib"],
    "Find It Cambridge — Planning": ["Find It project management"],
    "Internal — Contributing back to the community": [
        "Contrib", "Agaric: contrib", "Contributing", "Agaric contrib", "Agaric contributions",
    ],
    # Note camp is not here because that maps to Near North camp
    "Internal — Conferences & Meetups": [
        "Conference", "Cons", "Camps", "Conferences", "meetup", "meetups",
    ],
    "Internal — Content": [
        "Agaric site content", "Agaric content", "content", "blog", "blogging", "writing", "content writing",
    ],
    "Internal — Documentation": ["documentation", "docs", "documenting"],
    "Internal — Marketing": ["marketing", "Agaric marketing", "promotion", "Agaric promotion"],
    "Internal — Other": ["other"],
    "Internal — Overhead": ["overhead"],
    "Internal — Personal Learning": ["Learning", "Personal learning"],
    "Internal — Presentations": ["presentations", "presentation"],
    "Internal — Network Engagement": [
        "Network Engagement", "NE", "network engagment", "Social media", "Network building",
        "Agaric network engagement", "AgaricNetwork Engagement", "Networking",
    ],
    "VHFA — Contributing back to the community": ["VHFA contrib"],
}
# Rewrite every project that matches a known shorthand to its preferred
# "Project — Task" name.
for preferred, alternatives in compound_project_tasks.items():
    # Matching is case-insensitive, and the preferred name itself counts as a
    # match, but what gets written keeps the preferred capitalization.  The
    # accepted names are collected in a fresh set so the lookup table is not
    # modified by running this loop.
    accepted = {name.lower() for name in (*alternatives, preferred)}
    timelog.loc[timelog["project"].str.lower().isin(accepted), "project"] = preferred
# A compound "Project — Task" name is broken up: the part after the first
# ' — ' becomes the sub-project (in Harvest we use Task, which is really task
# type, for this) and the part before it remains the project.  Projects
# without the separator get a sub-project of None.
compound_separator = ' — '
is_compound = timelog['project'].str.contains(compound_separator)
project_parts = timelog['project'].str.split(compound_separator, n=1)
timelog['subproject'] = np.where(is_compound, project_parts.str[1], None)
timelog['project'] = np.where(is_compound, project_parts.str[0], timelog['project'])
# Irregular-but-known project names (the list on the right) and the name the
# timetracking tool uses for that project (the key on the left).  The keys of
# this table also decide which entries end up in the Harvest export.
# NOTE(review): the misspelled aliases look like deliberate typo-catchers;
# keep them as they are.
harvest_project_names = {
    "Everyday AI Project": ["EverydayAI", "MIT Scheller Teacher Education Program (STEP) Lab", "Education Arcade", "Everyday AI", "Everday AI", "EA"],
    "Boston Modern Orchestra Project": ["BMOP", "BMOP.org"],
    "crla.org - Development & Support": ["CRLA.org upgrade", "CRLA", "CRLA upgrade", "California Rural Legal Assistance", "crla.org"],
    "Cockrill Precision Products": ["Cockrill Corp", "Cockrill"],
    "Contratados.org": ["Contratados", "RADCAT", "Research Action Design LLC"],
    "Cultura Continued Support": ["Cultura", "MIT Cultura"],
    "Drutopia": ["Drutopia improvements", "Drutopia overhead", "Drutapia"],
    "EC Connect": ["eccconectcolorado.org", "Denver Econnect", "Denver Early Childhood", "ECconnect", "ECconnectColorado"],
    "Eliot School Site & CRM": ["Eliot", "Eliot School"],
    "encuentro 5 sites": ["Encuentro5", "e5", "Encuentro"],
    "endTB website support": ["PIH", "Partners In Health", "endTB", "endtb.org", "endtb support"],
    "ExperienceOlympic.com": ["Experience Olympic", "ExperienceOlympic", "Experience Olympia", "EO", "olympic", "ExpOlympic", "expeirenec olympic", "Experience Olympic LLC"],
    "Family & Home": ["Family and Home", "Family home"],
    "Find It Cambridge": ["Find It", "FIC", "Cambridge", "FindIt", "FindIt Cambridge"],
    "GBCLT.org": ["Gertrude Brown Community Land Trust", "GBCLT", "GB", "Gertrude", "Gertrude Brown", "CLT", "Land Trust", "Community Land Trust", "GB CLT"],
    "GEO Support": ["GEO", "GEO.coop", "Grassroots Economic Organizing"],
    "Green Calendar": ["Action Information, Inc.", "Action Information", "GreenCalendar", "Action Info", "GC", "green cal", "greencal", "actioninfo", "actinfo", "Action Informaton", "AI"],
    "HousingWorks": ["HousingWorks, Inc", "HousingWorks.net", "HW", "Housing Works", "hwerkz"],
    "Immigrant Navigator": ["IFSI", "Immigrant Family Services"],
    "Internal": ["Agaric", "Agaric internal"],
    "Jacket2 Drupal Website upgrade": ["University of Pennsylvania", "UPenn", "Jacket2", "jacket"],
    "Kick Big Polluters Out Website": ["Kick Big Polluters Out", "KBPO", "KOBP", "Rosa Luxemburg Stiftung", "Rosa Luxemburg"],
    "Leads": ["Lead", "Agaric leads", "Lead followups"],
    "lwcjustice.org": ["Longshore Workers Coalition", "LWC", "Longshore Workers", "lwcjustice", "lwc justice", "Longshore", "longshoreworkers"],
    "MASS Continuous Improvement": ["MASS Design Group", "MASS", "MASS Design"],
    "Metalwerx Maintenance": ["Metalwerx", "Metalwerx.com"],
    "NICHQ Data Upgrade": ["NICHQ Data"],
    "NICHQ Support": ["NICHQ", "NICHQ maintenance", "NICHQ Community"],
    "NICHQ FL CMS LAN": ["FL CMS LAN", "flcmslan", "NICHQ FLCMSLAN"],
    "North Carolina Housing Finance Agency": ["NCHFA", "NC HFA", "North Carolina", "NC Housing", "North Carolina Housing", "North Carolina HFA", "NC Housing Finance Agency"],
    "OAG - Office of Opportunity and Achievement Gaps Task Force": ["Boston Public Schools", "BPS", "OAG"],
    "Patient HM Brain Science Website": ["Patient HM", "patientHM"],
    "PECE migration to Drupal 10": ["PECE", "PECE migration", "PECE D10", "Platform for Experimental, Collaborative Ethnography", "University of California, Irvine"],
    "Portside": ["Portside.org Improvements 2020", "portside.org", "Portside support"],
    "SCDTDP Collaboratory Data Site System Security": ["SCDTDP", "NICHQ SCDTDP", "NICHQ security"],
    "Project GUTS/TWIG/Making Sense of Models": ["Teachers with GUTS", "TWIG", "GUTS", "Project GUTS"],
    "SaharaReporters.com": ["Sahara Reporters", "Sahara", "SaharaReporters", "Sahara Network", "SR", "Sahara Reporter"],
    "The Propaganda Site": ["TPS", "Propaganda Site", "The Propganda Site", "Murat & Clay"],
    "Therapy Fidelity App - Development": ["Tulane", "Therapy Fidelity App"],
    "Type Network": ["TypeNetwork", "TN", "Tyye Network", "Type Netork"],
    "VHFA": ["Vermont Housing Finance Agency", "Vermont", "Vermont Housing"],
    "Vulk redesign": ["Vulk", "Vulk.coop"],
    "Virtual safe space (VSS) and eReferral Pathways (eRPW)": ["UNICEF", "VSS", "eRPW", "VSS+eRPW", "VSS + eRPW", "VSS and ERPW", "unicef code review", "virtual safe spaces", "ereferral", "laaha", "laaha.org"],
    "Visions Unite": ["VU", "VisionsUnite"],
    "Winchester Find It demo site": ["Winchester", "Town Common", "Winchester Town Common", "Find It Winchester", "findit Winchester", "towncommon"],
    "C-Team support": ["ZEIT", "ZEIT ONLINE", "ZEIT Upgrade", "Zeit D9"],
}
# Known projects that are tracked but do not belong in the Harvest export.
# Same layout as the Harvest table: preferred name on the left, accepted
# aliases on the right.
other_project_names = {
    "Near North camp": [
        "Near North Camp", "Near North defense", "Encampment support", "Camp support",
        "NN camp defense", "NN camp", "NN defense", "Near North camp defense", "Camp",
        "Near North", "Encampment Defense",
    ],
    "Personal": ["Personal/external", "Personal / external", "External"],
    "Tzedakah": ["Community support"],
    "PWGD": ["People Who Give a Damn", "PWGD Inc"],
    "Workers Defense Alliance": ["WDA", "Alliance", "Twin Cities Workers Defense Alliance"],
    "Solidarity Network": ["SolNet"],
}
# All known projects, Harvest and otherwise, in one lookup table.
replacement_project_names = harvest_project_names | other_project_names

# Rewrite every project that matches a known alias to its preferred name.
for preferred, alternatives in replacement_project_names.items():
    # Matching is case-insensitive, and the preferred name itself counts as a
    # match, but what gets written keeps the preferred capitalization.  The
    # alias lists are shared with the two source tables, so the accepted names
    # are collected in a fresh set instead of being appended to those lists.
    accepted = {name.lower() for name in (*alternatives, preferred)}
    timelog.loc[timelog["project"].str.lower().isin(accepted), "project"] = preferred
# Replace irregular-but-known subproject ("Task") names with ones timetracking
# tools use.  Development is the default and never specified.
subproject_names = {
    "Contributing back to the community": ["contrib", "contributing", "contributions"],
    "Not billed": ["nb"],
    "Planning": ["plan", "meeting", "pm", "project management", "plannng"],
    "Quality Assurance": ["qa", "quality"],
}
for preferred, alternatives in subproject_names.items():
    # Case-insensitive match against the aliases and the preferred name; the
    # accepted names go into a fresh set so the table above is not modified.
    accepted = {name.lower() for name in (*alternatives, preferred)}
    timelog.loc[timelog["subproject"].str.lower().isin(accepted), "subproject"] = preferred
# TODO flag when task/subproject names are not known, because otherwise they
# get force-created on import.

# Default task for Leads project should be 'Leads' (not Development, which is
# what a missing subproject is filled with when entries are grouped).
timelog.loc[(timelog.project == "Leads") & timelog.subproject.isna(), "subproject"] = "Leads"
# This could also have been expressed as an entry in compound_project_tasks;
# it is left here as an example of a direct rule.
# Condense duplicate entries: one row per date / project / subproject /
# description, with the minutes summed and the earliest start and latest
# recorded time kept for each.
# The fillna calls are essential, or entries with a blank ('None') project
# would be dropped by the grouping; a blank subproject becomes "Development".
group_keys = [
    "date",
    timelog.project.fillna(""),
    timelog.subproject.fillna("Development"),
    "description",
]
summaries = {"time": 'sum', "started": 'min', "recorded": 'max'}
tl = timelog.groupby(group_keys).agg(summaries).reset_index()

# We're doing the final conversion to Harvest as a separate step because we
# want to factor out all of the above non-Harvest-specific logic.

# Newest recorded timestamp, and newest date formatted for use in file names.
latest = tl.recorded.max()
datest = tl.date.max().strftime('%Y-%m-%d')
# Split the condensed log three ways: Harvest projects, other known projects,
# and everything else (including blank projects), which is kept so the
# excluded items can be written to their own CSV.
in_harvest = tl.project.isin(list(harvest_project_names))
in_other = tl.project.isin(list(other_project_names))
is_known = tl.project.isin(list(replacement_project_names))
hrvst = tl[in_harvest]
other = tl[in_other]
unknown = tl[~is_known]
# Build the Harvest import sheet: Harvest's column names, minutes converted
# to hours, the person's name, and the client looked up from the project.
harvest_columns = {
    'date': 'Date',
    'project': 'Project',
    'subproject': 'Task',
    'description': 'Notes',
}
harvest = hrvst.rename(columns=harvest_columns)
project_client_mapping = settings.harvest_get_projects_clients_map()
harvest = harvest.assign(**{
    "Hours": harvest["time"] / 60,
    "First name": "Benjamin",
    "Last name": "Melançon",
    "Client": harvest["Project"].map(project_client_mapping),
})
# The raw minutes and timestamps are not part of the import sheet.
harvest = harvest.drop(columns=['started', 'recorded', 'time'])
if debug:
    # Interactive run: leave per-project hour totals in the namespace for
    # inspection and touch nothing on disk.
    hrvst_grouped = hrvst.groupby("project")["time"].sum() / 60
    other_grouped = other.groupby("project")["time"].sum() / 60
    unknown_grouped = unknown.groupby("project")["time"].sum() / 60
    print("We do not write to CSV nor update the latest recorded setting when run interactively in the python shell.")
else:
    # Scripted run: write one CSV per category, named after the newest date
    # in the data, then store the newest recorded timestamp in the settings.
    exports = {
        'harvest-timesheets-': harvest,
        'personal-other-': other,
        'unknown-': unknown,
    }
    for prefix, frame in exports.items():
        frame.to_csv(prefix + datest + '.csv', index=False)
    settings.pomodoro_latest_recorded(latest)