# parse-timelogs-for-upload/pomodoro_to_harvest.py
#
# Parse PomodoroPrompt CSV time logs and convert them into CSVs suitable for
# upload to Harvest, plus separate CSVs for personal/other and unknown
# projects.

import pandas as pd
import numpy as np
import glob
import re
import sys
# Import our local settings management.
import settings
# Interactive sessions (python shell / REPL expose sys.ps1) run in debug
# mode: we keep deep-copied snapshots of intermediate dataframes and skip
# writing any output files at the end.
debug = hasattr(sys, 'ps1')
if debug:
    import copy
# Load the raw pomodoro log(s): either a single CSV ("logfile") or every CSV
# in a folder ("logpath") from settings.ini.
if settings.pomodoro_logfile():
    # This works for one file. Note: a DataFrame has no truth value, so the
    # old `if not timelog:` always raised ValueError; catch a missing file
    # explicitly and check emptiness via .empty instead.
    try:
        timelog = pd.read_csv(settings.pomodoro_logfile())
    except FileNotFoundError:
        sys.exit("No logfile found at location specified for logfile.")
    if timelog.empty:
        sys.exit("No logfile found at location specified for logfile.")
elif settings.pomodoro_logpath():
    # For multiple files:
    path = settings.pomodoro_logpath()
    all_files = glob.glob(path + "*.csv")
    if not all_files:
        sys.exit("No folder at location specified for logpath.")
    li = []
    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0)
        li.append(df)
    timelog = pd.concat(li, axis=0, ignore_index=True)
else:
    sys.exit("You must set either a logfile (single CSV) or a logpath (folder containing CSVs) in your settings.ini to use this script.")
# Snapshot the raw import for interactive inspection, drop exact duplicate
# rows (re-runs can re-log identical entries), then snapshot again.
if debug:
    imported = timelog.copy(deep=True)
timelog.drop_duplicates(inplace=True)
if debug:
    nodupes = timelog.copy(deep=True)
# Dump bad data. The real solution here is to get rid of the 'Cancel' button
# on the PomodoroPrompt dialog, but rows with a blank work description come
# in as NaN (possibly because it's the last row of the spreadsheet?) and we
# cannot do anything without a description, so drop those rows at the outset.
# A blank 'intention' is allowed, so only these three columns are required:
timelog = timelog.dropna(subset=['started', 'recorded', 'description']).reset_index(drop=True)
if debug:
    dropna = copy.deepcopy(timelog)
# For debugging, keep originals of the columns we are about to rewrite.
for backup_col, source_col in (("orig_desc", "description"),
                               ("orig_started", "started"),
                               ("orig_recorded", "recorded")):
    timelog[backup_col] = timelog[source_col]
# Clean up description before we go to work on it.
timelog['description'] = timelog['description'].str.strip()
# Allow multiple entries to be put into one prompt by splitting with
# semicolon; each piece becomes its own row.
# TODO make this a flag since it's possible to use semicolons without meaning
# to make multiple task entries at once.
timelog["description"] = timelog["description"].str.split(";")
timelog = timelog.explode("description").reset_index()
if debug:
    mess = copy.deepcopy(timelog)
# PomodoroPrompt saves timestamps as UTC; convert to Eastern. Unparseable
# 'started' values become NaT (errors='coerce') rather than raising.
timelog["started"] = pd.to_datetime(timelog["started"], errors='coerce').dt.tz_convert("America/New_York")
timelog["recorded"] = pd.to_datetime(timelog["recorded"]).dt.tz_convert("America/New_York")
# Skip everything at or before the high-water mark of the previous upload.
latest_recorded = settings.pomodoro_latest_recorded()
if latest_recorded:
    timelog = timelog[timelog.recorded > pd.to_datetime(latest_recorded)]
# Every pomodoro is half an hour.
timelog["time"] = 30
# A pomodoro started before 3am Eastern time is considered to be a
# continuation of the day before, so we are, effectively, on West Coast time
# for determining the day we want to associate a time entry with.
timelog["date"] = timelog["started"].dt.tz_convert("America/Los_Angeles").dt.date
timelog['date'] = pd.to_datetime(timelog['date'])
timelog["day_of_week"] = pd.to_datetime(timelog["date"]).dt.day_name()
# If a project has been specified (task prefixed with "Project: "), then put
# the project in its own column. Hoist the shared split so it is computed
# once; np.where evaluates all arguments regardless.
has_project_prefix = timelog['description'].str.contains(': ')
prefix_split = timelog['description'].str.split(': ', n=1)
timelog['project'] = np.where(has_project_prefix, prefix_split.str[0], None)
timelog['description'] = np.where(has_project_prefix, prefix_split.str[1], timelog['description'])
# Mid-work clean up of description and new project.
timelog['description'] = timelog['description'].str.strip()
timelog['project'] = timelog['project'].str.strip()
# A leading parenthesized note like "(yesterday)" requests a date shift;
# capture it (the regex keeps the opening paren, which we strip off here).
timelog['tmp_timeshift'] = timelog['description'].str.extract(r'^(\(.+?)\)', expand=False)
timelog['tmp_timeshift'] = timelog['tmp_timeshift'].str.strip().str.replace("(", "", regex=False)
# In an ideal world we would use https://github.com/bear/parsedatetime or
# similar and even better figure out the right date for strings like
# "Monday", but eh, this'll do: map known relative-day phrases to offsets.
timeshift_days = {
    -1: ['one day ago', '1 day ago', 'yesterday'],
    -2: ['two days ago', '2 days ago', 'day before yesterday', 'the day before yesterday'],
    -3: ['three days ago', '3 days ago'],
    -4: ['four days ago', '4 days ago'],
    -5: ['five days ago', '5 days ago'],
}
for offset, phrases in timeshift_days.items():
    # The bare number ("-2") is accepted too.
    matchable = phrases + [str(offset)]
    timelog.loc[timelog.tmp_timeshift.str.lower().isin(matchable), "tmp_daysdelta"] = offset
timelog['tmp_daysdelta'] = timelog['tmp_daysdelta'].fillna(0)
timelog['date'] = timelog['date'] + pd.to_timedelta(timelog['tmp_daysdelta'], unit='D')
# Re-wrap the captured phrase in parentheses so we can strip the exact
# "(yesterday)"-style text back out of the description below.
timelog.tmp_timeshift = '(' + timelog.tmp_timeshift + ')'
timelog['tmp_timeshift'] = timelog['tmp_timeshift'].fillna("")
# str.replace with an empty needle is a no-op, so rows without a timeshift
# pass through unchanged.
timelog['description'] = timelog.apply(lambda row: row['description'].replace(row['tmp_timeshift'], ''), axis=1)
# Be sure to trim any whitespace before we regex for ending on asterisk
# number. If a multiplier has been provided (an asterisk and an integer at
# the end of a task), then multiply the time by it and remove it from the
# description. Anchor on end-of-string ($) and split from the right so we
# act on the same asterisk the regex matched.
timelog['description'] = timelog['description'].str.strip()
multiplier_pattern = re.compile(r'\*\s*\d+$')
# np.where is avoided deliberately here: it evaluates ALL of its arguments,
# even for rows where the condition is False, and the chained string ops
# blow up on rows that have no multiplier.
def _split_multiplier(desc):
    # Return (description without multiplier, multiplier) for one row.
    if multiplier_pattern.search(desc):
        head, tail = desc.rsplit('*', 1)
        return head, tail.strip()
    return desc, 1
timelog['tmp_multiplier'] = timelog['description'].apply(lambda d: _split_multiplier(d)[1])
timelog['description'] = timelog['description'].apply(lambda d: _split_multiplier(d)[0])
timelog["time"] = timelog["time"] * timelog['tmp_multiplier'].astype(int)
timelog.drop(columns=['tmp_multiplier'], inplace=True)
# Clean up description again, after it has been sliced and diced.
timelog['description'] = timelog['description'].str.strip()
# Specific tasks are expanded from items in the list on the right into the
# project-task combo on the left.
compound_project_tasks = {
    "Drutopia — Contributing back to the community": ["Drutopia contrib", "Drutopia contributing", "Drutopia contributions"],
    "Find It Cambridge — Contributing back to the community": ["Find It Contrib"],
    "Find It Cambridge — Planning": ["Find It project management"],
    "Internal — Contributing back to the community": ["Contrib", "Agaric: contrib", "Contributing", "Agaric contrib", "Agaric contributions"],
    "Internal — Conferences & Meetups": ["Conference", "Cons", "Con", "Camps", "Camp", "Conferences", "meetup", "meetups"],
    "Internal — Content": ["Agaric site content", "Agaric content", "content", "blog", "blogging", "writing", "content writing"],
    "Internal — Documentation": ["documentation", "docs", "documenting"],
    "Internal — Marketing": ["marketing", "Agaric marketing", "promotion", "Agaric promotion"],
    "Internal — Other": ["other"],
    "Internal — Overhead": ["overhead"],
    "Internal — Planning": ["planning", "plan"],
    "Internal — Personal Learning": ["Learning", "Personal learning", "learn"],
    "Internal — Presentations": ["presentations", "presentation"],
    "Internal — Paid time off": ["PTO", "Paid Time Off", "Agaric PTO"],
    "Internal — Network Engagement": ["Network Engagement", "NE", "network engagment", "Social media", "Network building", "Agaric network engagement", "AgaricNetwork Engagement", "Networking"],
    "VHFA — Contributing back to the community": ["VHFA contrib"],
}
for preferred, alternatives in compound_project_tasks.items():
    # Compare case-insensitively against every alternative plus the preferred
    # name itself; what we write back keeps the preferred capitalization.
    candidates = [name.lower() for name in alternatives + [preferred]]
    timelog.loc[timelog.project.str.lower().isin(candidates), "project"] = preferred
# Replace single dashes, en-dashes, and attempts at em-dashes with em-dashes
# because apparently i really only accepted em-dashes for all this time and
# never noticed because i use them so consistently.
# NOTE(review): the dash characters below were reconstructed — the scraped
# source had the em-/en-dash characters stripped out, leaving broken calls
# like str.split('', n=1). Confirm against the original file.
timelog['project'] = timelog['project'].str.replace('---', '—', regex=False)
timelog['project'] = timelog['project'].str.replace('--', '—', regex=False)
timelog['project'] = timelog['project'].str.replace(' – ', '—', regex=False)
timelog['project'] = timelog['project'].str.replace(' - ', '—', regex=False)
# If a compound project was specified ("Project — Task"), break that out
# into a sub-project (in Harvest, we use Task, which is really task type,
# for this).
timelog['subproject'] = (np.where(timelog['project'].str.contains('—'), timelog['project'].str.split('—', n=1).str[1], None))
timelog['project'] = (np.where(timelog['project'].str.contains('—'), timelog['project'].str.split('—', n=1).str[0], timelog['project']))
# Trim any surrounding whitespace from final project and sub-project/task.
timelog['subproject'] = timelog['subproject'].str.strip()
timelog['project'] = timelog['project'].str.strip()
# Replace irregular-but-known project names with ones timetracking tools use.
# Pure data: canonical Harvest project name -> list of accepted aliases.
# NOTE: insertion order matters — the replacement loop below applies entries
# in order, so if an alias ever appeared under two keys the later key would
# win. Do not reorder casually.
harvest_project_names = {
"Everyday AI Project": ["EverydayAI", "MIT Scheller Teacher Education Program (STEP) Lab", "Education Arcade", "Everyday AI", "Everday AI", "EA"],
"Boston Modern Orchestra Project": ["BMOP", "BMOP.org"],
"crla.org - Development & Support": ["CRLA.org upgrade", "CRLA", "CRLA upgrade", "California Rural Legal Assistance", "crla.org"],
"Cockrill Precision Products": ["Cockrill Corp", "Cockrill"],
"Contratados Migration and Upgrade to D10": ["Contratados", "Contratados.org", "RADCAT", "RAD", "Research Action Design LLC", "Contradados", "Contratodos"],
"Cultura Continued Support": ["Cultura", "MIT Cultura"],
"Drutopia": ["Drutopia improvements", "Drutopia overhead", "Drutapia"],
"EC Connect": ["eccconectcolorado.org", "Denver Econnect", "Denver Early Childhood", "ECconnect", "ECconnectColorado"],
"Eliot School Site & CRM": ["Eliot", "Eliot School"],
"encuentro 5 sites": ["Encuentro5", "e5", "Encuentro"],
"endTB website support": ["PIH", "Partners In Health", "endTB", "endtb.org", "endtb support"],
"ExperienceOlympic.com": ["Experience Olympic", "ExperienceOlympic", "Experience Olympia", "EO", "olympic", "ExpOlympic", "expeirenec olympic", "Experience Olympic LLC"],
"Family & Home": ["Family and Home", "Family home"],
"Find It Cambridge": ["Find It", "FIC", "Cambridge", "FindIt", "FindIt Cambridge"],
"GBCLT.org": ["Gertrude Brown Community Land Trust", "GBCLT", "GB", "Gertrude", "Gertrude Brown", "CLT", "Land Trust", "Community Land Trust", "GB CLT"],
"GEO Support": ["GEO", "GEO.coop", "Grassroots Economic Organizing"],
"Green Calendar": ["Action Information, Inc.", "Action Information", "GreenCalendar", "Action Info", "GC", "green cal", "greencal", "actioninfo", "actinfo", "Action Informaton", "AI"],
"HousingWorks": ["HousingWorks, Inc", "HousingWorks.net", "HW", "Housing Works", "hwerkz"],
"hoptothebeat and beantowncamp": ["Hop To The Beat", "hoptothebeat", "beanttowncamp", "hopto", "beantown", "HTTB"],
"Immigrant Navigator": ["IFSI", "Immigrant Family Services"],
"Irish Federation of University Teachers": ["IFUT", "Irish Teachers", "IE"],
"Internal": ["Agaric", "Agaric internal"],
"Jacket2 Drupal Website upgrade": ["University of Pennsylvania", "UPenn", "Jacket2", "jacket"],
"Kalamuna subcontracting": ["Kalamuna Inc", "Kalamuna"],
"Kick Big Polluters Out Website": ["Kick Big Polluters Out", "KBPO", "KOBP", "Rosa Luxemburg Stiftung", "Rosa Luxemburg"],
"Leads": ["Lead", "Agaric leads", "Lead followups"],
"lwcjustice.org": ["Longshore Workers Coalition", "LWC", "Longshore Workers", "lwcjustice", "lwc justice", "Longshore", "longshoreworkers"],
"MASS Continuous Improvement": ["MASS Design Group", "MASS", "MASS Design"],
"monasteriesoftheheart.org support": ["Monasteries of the Heart", "MOH", "monastery"],
"Metalwerx Maintenance": ["Metalwerx", "Metalwerx.com"],
"NICHQ Data Upgrade": ["NICHQ Data"],
"NICHQ Support": ["NICHQ", "NICHQ maintenance", "NICHQ Community"],
"NICHQ FL CMS LAN": ["FL CMS LAN", "flcmslan", "NICHQ FLCMSLAN"],
"North Carolina Housing Finance Agency": ["NCHFA", "NC HFA", "North Carolina", "NC Housing", "North Carolina Housing", "North Carolina HFA", "NC Housing Finance Agency"],
"OAG - Office of Opportunity and Achievement Gaps Task Force": ["Boston Public Schools", "BPS", "OAG"],
"Patient HM Brain Science Website": ["Patient HM", "patientHM"],
"PDX Contemporary Art Upgrade to D10": ["PDX Contemporary Art", "PDX Art", "Portland Art", "PDX Art Museum", "PDX"],
"PECE migration to Drupal 10": ["PECE", "PECE migration", "PECE D10", "Platform for Experimental, Collaborative Ethnography", "University of California, Irvine"],
"Plants Earth Life": ["PlantsEarthLife.org", "PEL", "PlantsEarthLife"],
"Portside": ["Portside.org Improvements 2020", "portside.org", "Portside support"],
"SCDTDP Collaboratory Data Site System Security": ["SCDTDP", "NICHQ SCDTDP", "NICHQ security"],
"Making Sense of Models (Teachers with Guts)": ["Teachers with GUTS", "TWIG", "GUTS", "Project GUTS", "Making Sense of Models", "Project GUTS/TWIG/Making Sense of Models"],
"SaharaReporters.com": ["Sahara Reporters", "Sahara", "SaharaReporters", "Sahara Network", "SR", "Sahara Reporter"],
"The Propaganda Site": ["TPS", "Propaganda Site", "The Propganda Site", "Murat & Clay"],
"Therapy Fidelity App - Development": ["Tulane", "Therapy Fidelity App"],
"Type Network": ["TypeNetwork", "TN", "Tyye Network", "Type Netork"],
"VHFA": ["Vermont Housing Finance Agency", "Vermont", "Vermont Housing"],
"Vulk redesign": ["Vulk", "Vulk.coop"],
"Virtual safe space (VSS) and eReferral Pathways (eRPW)": ["UNICEF", "VSS", "eRPW", "VSS+eRPW", "VSS + eRPW", "VSS and ERPW", "unicef code review", "virtual safe spaces", "ereferral", "laaha", "laaha.org"],
"Visions Unite": ["VU", "VisionsUnite"],
"Winchester Find It demo site": ["Winchester", "Town Common", "Winchester Town Common", "Find It Winchester", "findit Winchester", "towncommon"],
"C-Team support": ["ZEIT", "ZEIT ONLINE", "ZEIT Upgrade", "Zeit D9"],
}
# Projects tracked outside Harvest (written to the personal-other CSV, not
# uploaded). Same alias-map shape as harvest_project_names above.
other_project_names = {
"Near North camp": ["Near North Camp", "Near North defense", "Encampment support", "Camp support", "NN camp defense", "NN camp", "NN defense", "Near North camp defense", "Near North", "Encampment Defense"],
"Personal": ["Personal/external", "Personal / external", "External"],
"Tzedakah": ["Community support"],
"PWGD": ["People Who Give a Damn", "PWGD Inc"],
"Workers Defense Alliance": ["WDA", "Alliance", "Twin Cities Workers Defense Alliance"],
"Solidarity Network": ["SolNet"],
}
# Merge both alias maps for a single canonicalization pass (PEP 584 dict
# union; harvest entries come first).
replacement_project_names = harvest_project_names | other_project_names
for preferred, alternatives in replacement_project_names.items():
    # Compare case-insensitively against every alternative plus the preferred
    # name itself; what we write back keeps the preferred capitalization.
    candidates = [name.lower() for name in alternatives + [preferred]]
    timelog.loc[timelog.project.str.lower().isin(candidates), "project"] = preferred
# Replace irregular-but-known subproject ("Task") names with ones
# timetracking tools use. Development is the default and never specified.
subproject_names = {
    "Contributing back to the community": ["contrib", "contributing", "contributions"],
    "Not billed": ["nb"],
    "Paid time off": ["PTO"],
    "Planning": ["plan", "meeting", "pm", "project management", "plannng"],
    "Quality Assurance": ["qa", "quality"],
}
for preferred, alternatives in subproject_names.items():
    # Case-insensitive match; the preferred spelling keeps its case on output.
    candidates = [name.lower() for name in alternatives + [preferred]]
    timelog.loc[timelog.subproject.str.lower().isin(candidates), "subproject"] = preferred
# TODO flag when task/subproject names are not known, because otherwise they
# get force-created on import.
# Default task for the Leads project should be 'Leads' (not Development as
# will be filled in below). This could also have been done in
# compound_project_tasks; leaving it here as an example.
timelog.loc[(timelog.project == "Leads") & timelog.subproject.isna(), "subproject"] = "Leads"
# Condense duplicate entries by date, summing the minutes spent, and listing
# the first started and last recorded times for each task. The fillna is
# essential or we drop entries with blank ('None') projects.
grouping = [
    "date",
    timelog.project.fillna(""),
    timelog.subproject.fillna("Development"),
    "description",
]
tl = timelog.groupby(grouping).agg(
    {"time": 'sum', "started": 'min', "recorded": 'max'}
).reset_index()
# We're doing the final conversion to Harvest as a separate step because we
# want to factor out all of the above non-Harvest-specific logic.
latest = tl.recorded.max()
datest = tl.date.max().strftime('%Y-%m-%d')
# Partition into Harvest projects, known non-Harvest projects, and everything
# else (including blank projects) so nothing is silently dropped.
hrvst = tl[tl.project.isin(harvest_project_names.keys())]
other = tl[tl.project.isin(other_project_names.keys())]
unknown = tl[~tl.project.isin(replacement_project_names.keys())]
# Rename to the column headers Harvest's CSV import expects.
harvest = hrvst.rename(columns={'date': 'Date', 'project': 'Project', 'subproject': 'Task', 'description': 'Notes'})
harvest["Hours"] = harvest["time"] / 60
harvest["First name"] = settings.harvest_get_profile_firstname()
harvest["Last name"] = settings.harvest_get_profile_lastname()
project_client_mapping = settings.harvest_get_projects_clients_map()
harvest["Client"] = harvest["Project"].map(project_client_mapping)
harvest.drop(columns=['started', 'recorded', 'time'], inplace=True)
if debug:
    # Interactive runs only summarize hours per project for inspection;
    # nothing is written and the high-water mark is left untouched.
    hrvst_grouped = hrvst.groupby("project").agg({"time": "sum"})["time"] / 60
    other_grouped = other.groupby("project").agg({"time": "sum"})["time"] / 60
    unknown_grouped = unknown.groupby("project").agg({"time": "sum"})["time"] / 60
    print("We do not write to CSV nor update the latest recorded setting when run interactively in the python shell.")
else:
    harvest.to_csv('harvest-timesheets-' + datest + '.csv', index=False)
    other.to_csv('personal-other-' + datest + '.csv', index=False)
    unknown.to_csv('unknown-' + datest + '.csv', index=False)
    # Remember the newest recorded timestamp so the next run skips these.
    settings.pomodoro_latest_recorded(latest)