"""Turn Pomodoro Prompt CSV logs into timesheet CSV files.

The script reads one log file (or every ``*.csv`` under the configured log
path), cleans up the free-text task descriptions, derives project,
subproject, date and duration for each entry, and condenses the result into
one row per date/project/subproject/description.

When run non-interactively it writes three CSV files (Harvest projects,
other known projects, and unknown projects) and stores the latest recorded
timestamp through ``settings``.  When run from an interactive Python shell it
keeps intermediate snapshots in module-level variables and writes nothing.
"""
import copy
import glob
import re
import sys

import numpy as np
import pandas as pd

# Import our local settings management.
import settings

# An interactive shell defines sys.ps1; treat that as debug mode.
debug = hasattr(sys, 'ps1')

# Every logged pomodoro counts for this many minutes (before multipliers).
POMODORO_MINUTES = 30

# A leading parenthetical such as "(yesterday)"; group 1 is the inner text.
TIMESHIFT_PATTERN = re.compile(r'^\(([^)]+)\)')

# A trailing "* <integer>" multiplier; group 1 is the integer.
MULTIPLIER_PATTERN = re.compile(r'\*\s*(\d+)$')


def split_timeshift(description, phrase_days):
    """Split a recognized leading "(time shift)" off a description.

    Returns a ``(days, matched_text, remainder)`` tuple.  ``days`` is the
    (negative) day offset and ``matched_text`` the parenthetical that was
    removed.  When the description does not start with a parenthetical whose
    content is a key of *phrase_days* (compared stripped and lower-cased),
    the result is ``(0, "", description)`` and the description is untouched.
    """
    match = TIMESHIFT_PATTERN.match(description)
    if match:
        days = phrase_days.get(match.group(1).strip().lower())
        if days is not None:
            return days, match.group(0), description[match.end():]
    return 0, "", description


def split_multiplier(description):
    """Split a trailing "* N" multiplier off a description.

    Returns ``(description_without_multiplier, N)``, or ``(description, 1)``
    when the description does not end in an asterisk followed by an integer.
    """
    match = MULTIPLIER_PATTERN.search(description)
    if match:
        return description[:match.start()], int(match.group(1))
    return description, 1


def apply_preferred_names(frame, column, preferred_names):
    """Replace known alternative names in *column* with the preferred name.

    *preferred_names* maps each preferred name to a list of alternatives.
    Comparison is case-insensitive and the preferred name itself counts as a
    match, so its capitalization gets normalized too.  Entries are applied in
    dict order, and *frame* is modified in place.  The alternative lists are
    not modified.
    """
    for preferred, alternatives in preferred_names.items():
        accepted = {name.lower() for name in alternatives}
        accepted.add(preferred.lower())
        matches = frame[column].str.lower().isin(accepted)
        frame.loc[matches, column] = preferred


logfile = settings.pomodoro_logfile()
if logfile:
    # This works for one file:
    timelog = pd.read_csv(logfile)
else:
    # For multiple files:
    path = settings.pomodoro_logpath()
    all_files = sorted(glob.glob(path + "*.csv"))
    if not all_files:
        raise FileNotFoundError("No pomodoro CSV logs found matching " + path + "*.csv")
    li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]
    timelog = pd.concat(li, axis=0, ignore_index=True)

if debug:
    imported = copy.deepcopy(timelog)

timelog.drop_duplicates(inplace=True)

if debug:
    nodupes = copy.deepcopy(timelog)

# Dump bad data.  The real solution here is to get rid of the 'Cancel' button
# on the Pomodoro Prompt dialog, but until then we need to drop the rows where
# the work task description is blank, which comes in as not-a-number.  We
# cannot do anything with no data in the description, so drop them at the
# outset.  We can allow no data in the 'intention', so only these three
# columns are checked:
timelog = timelog.dropna(subset=['started', 'recorded', 'description'])
timelog = timelog.reset_index(drop=True)

if debug:
    dropna = copy.deepcopy(timelog)

# For debugging, keep originals around.
timelog["orig_desc"] = timelog["description"]
timelog["orig_started"] = timelog["started"]
timelog["orig_recorded"] = timelog["recorded"]

# Clean up description before we go to work on it.
timelog['description'] = timelog['description'].str.strip()

# Allow multiple entries to be put into one prompt by splitting with semicolon.
# TODO make this a flag since it's possible to use semicolons without meaning
# to make multiple task entries at once.
timelog["description"] = timelog["description"].str.split(";")
timelog = timelog.explode("description")
# A trailing semicolon (or a whitespace-only description) yields an empty
# fragment; like blank descriptions above, there is nothing to record for it.
timelog = timelog[timelog["description"].str.strip() != ""].reset_index()

if debug:
    mess = copy.deepcopy(timelog)

# PomodoroPrompt saves as UTC; utc=True also copes with timestamps that carry
# no offset or differing offsets, which would otherwise break tz_convert.
timelog["started"] = pd.to_datetime(timelog["started"], errors='coerce', utc=True).dt.tz_convert("US/Eastern")
timelog["recorded"] = pd.to_datetime(timelog["recorded"], utc=True).dt.tz_convert("US/Eastern")

latest_recorded = settings.pomodoro_latest_recorded()
if latest_recorded:
    # Copy so the column assignments below act on an independent frame.
    timelog = timelog[timelog.recorded > pd.to_datetime(latest_recorded)].copy()

timelog["time"] = POMODORO_MINUTES

# A pomodoro started before 3am Eastern time is considered to be a continuation
# of the day before, so we are, effectively, on West Coast time for determining
# the day we want to associate a time entry with.
timelog["date"] = pd.to_datetime(timelog["started"].dt.tz_convert("US/Pacific").dt.date)
timelog["day_of_week"] = timelog["date"].dt.day_name()

# If a project has been specified (task prefixed with the project name and a
# colon), then put the project in its own column.
has_project = timelog['description'].str.contains(': ', regex=False, na=False)
project_and_task = timelog['description'].str.split(': ', n=1)
timelog['project'] = np.where(has_project, project_and_task.str[0], None)
timelog['description'] = np.where(has_project, project_and_task.str[1], timelog['description'])

# Mid-work clean up of description and new project.
timelog['description'] = timelog['description'].str.strip()
timelog['project'] = timelog['project'].str.strip()

# A description may start with a parenthetical saying the work happened on an
# earlier day, e.g. "(yesterday) wrote docs" or "(-2) wrote docs".
# In an ideal world we would use https://github.com/bear/parsedatetime or
# similar and even better figure out the right date for strings like "Monday"
# but eh this'll do.
timeshift_days = {
    -1: ['one day ago', '1 day ago', 'yesterday'],
    -2: ['two days ago', '2 days ago', 'day before yesterday', 'the day before yesterday'],
    -3: ['three days ago', '3 days ago'],
    -4: ['four days ago', '4 days ago'],
    -5: ['five days ago', '5 days ago'],
}
# Flatten to phrase -> days; the bare number ("-1") is accepted as well.
phrase_days = {}
for days, phrases in timeshift_days.items():
    phrase_days[str(days)] = days
    for phrase in phrases:
        phrase_days[phrase] = days

# Only a recognized time shift is removed from the description; any other
# leading parenthetical is left in place as part of the task text.
shifts = [split_timeshift(text, phrase_days) for text in timelog['description']]
timelog['tmp_daysdelta'] = pd.Series([shift[0] for shift in shifts], index=timelog.index, dtype='float64')
timelog['tmp_timeshift'] = pd.Series([shift[1] for shift in shifts], index=timelog.index, dtype='object')
timelog['description'] = pd.Series([shift[2] for shift in shifts], index=timelog.index, dtype='object')
timelog['date'] = timelog['date'] + pd.to_timedelta(timelog['tmp_daysdelta'], unit='D')

# If a multiplier has been provided (an asterisk and an integer at the end of a
# task), then multiply the time by it and remove it from the description.
# This is done per value in plain Python rather than with np.where, because
# np.where evaluates both branches for every row, including rows the
# condition excludes.
multiplied = [split_multiplier(text) for text in timelog['description']]
timelog['description'] = pd.Series([item[0] for item in multiplied], index=timelog.index, dtype='object')
timelog["time"] = timelog["time"] * pd.Series([item[1] for item in multiplied], index=timelog.index, dtype='int64')

# Clean up description again, after it has been sliced and diced.
timelog['description'] = timelog['description'].str.strip()

# Specific tasks are expanded from items in list on right into project-task combo on left.
# NOTE(review): "Agaric: contrib" can never match, because the project is the
# text before the first ': ' and so never contains ': ' itself.
compound_project_tasks = {
    "Drutopia — Contributing back to the community": ["Drutopia contrib", "Drutopia contributing", "Drutopia contributions"],
    "Find It Cambridge — Contributing back to the community": ["Find It Contrib"],
    "Find It Cambridge — Planning": ["Find It project management"],
    "Internal — Contributing back to the community": ["Contrib", "Agaric: contrib", "Contributing", "Agaric contrib", "Agaric contributions"],
    "Internal — Conferences & Meetups": ["Conference", "Camps", "Camp", "Conferences", "meetup", "meetups"],
    "Internal — Documentation": ["documentation", "docs", "documenting"],
    "Internal — Personal Learning": ["Learning", "Personal learning"],
    "Internal — Presentations": ["presentations", "presentation"],
    "Internal — Network Engagement": ["Network Engagement", "network engagement", "Network engagment", "Social media", "Network building", "Agaric network engagement"],
    "Internal — Content": ["Agaric site content", "Agaric content"],
    "VHFA — Contributing back to the community": ["VHFA contrib"],
}
apply_preferred_names(timelog, "project", compound_project_tasks)

# If a compound project was specified, break that out into a sub-project (in
# Harvest, we use Task, which is really task type, for this).
is_compound = timelog['project'].str.contains(' — ', regex=False, na=False)
project_and_subproject = timelog['project'].str.split(' — ', n=1)
timelog['subproject'] = np.where(is_compound, project_and_subproject.str[1], None)
timelog['project'] = np.where(is_compound, project_and_subproject.str[0], timelog['project'])

# Replace irregular-but-known project names with ones timetracking tools use.
harvest_project_names = {
    "Boston Modern Orchestra Project": ["BMOP", "BMOP.org"],
    "CRLA.org upgrade": ["CRLA", "CRLA upgrade"],
    "Cockrill Precision Products": ["Cockrill Corp", "Cockrill"],
    "Cultura Continued Support": ["Cultura", "MIT Cultura"],
    "Drutopia": ["Drutopia improvements", "Drutopia overhead"],
    "EC Connect": ["eccconectcolorado.org", "Denver Econnect", "Denver Early Childhood", "ECconnect", "ECconnectColorado"],
    "Eliot School Site & CRM": ["Eliot", "Eliot School"],
    "encuentro 5 sites": ["Encuentro5", "e5", "Encuentro"],
    "Family & Home": ["Family and Home", "Family home"],
    "Find It Cambridge": ["Find It", "FIC", "Cambridge", "FindIt", "FindIt Cambridge"],
    "GEO Support": ["GEO", "GEO.coop", "Grassroots Economic Organizing"],
    "Immigrant Navigator": ["IFSI", "Immigrant Family Services"],
    "Internal": ["Agaric", "Agaric internal"],
    "Leads": ["Lead", "Agaric leads", "Lead followups"],
    "MASS Continuous Improvement": ["MASS Design Group", "MASS", "MASS Design"],
    "Metalwerx Maintenance": ["Metalwerx", "Metalwerx.com"],
    "NICHQ Data Upgrade": ["NICHQ Data"],
    "NICHQ Support": ["NICHQ", "NICHQ maintenance", "NICHQ Community"],
    "NICHQ FL CMS LAN": ["FL CMS LAN", "flcmslan", "NICHQ FLCMSLAN"],
    "OAG - Office of Opportunity and Achievement Gaps Task Force": ["Boston Public Schools", "BPS", "OAG"],
    "Patient HM Brain Science Website": ["Patient HM", "patientHM"],
    "Portside": ["Portside.org Improvements 2020", "portside.org", "Portside support"],
    "SCDTDP Collaboratory Data Site System Security": ["SCDTDP", "NICHQ SCDTDP", "NICHQ security"],
    "Project GUTS/TWIG/Making Sense of Models": ["Teachers with GUTS", "TWIG", "GUTS", "Project GUTS"],
    "The Propaganda Site": ["TPS", "Propaganda Site", "The Propganda Site", "Murat & Clay"],
    "VHFA": ["Vermont Housing Finance Agency", "Vermont", "Vermont Housing"],
    "Vulk redesign": ["Vulk", "Vulk.coop"],
    "Visions Unite": ["VU", "VisionsUnite"],
    "C-Team support": ["ZEIT", "ZEIT ONLINE", "ZEIT Upgrade", "Zeit D9"],
}
# NOTE(review): "Camp" is also listed under "Internal — Conferences & Meetups"
# above, which is applied first, so the "Camp" alternative here never matches.
other_project_names = {
    "Near North camp": ["Near North Camp", "Near North defense", "Encampment support", "Camp support", "NN camp defense", "NN camp", "NN defense", "Near North camp defense", "Camp", "Near North"],
    "Personal": ["Personal/external", "Personal / external", "External"],
    "Tzedakah": ["Community support"],
    "PWGD": ["People Who Give a Damn", "PWGD Inc"],
    "Workers Defense Alliance": ["WDA", "Alliance", "Twin Cities Workers Defense Alliance"],
}
replacement_project_names = harvest_project_names | other_project_names
apply_preferred_names(timelog, "project", replacement_project_names)

# Replace irregular-but-known subproject ("Task") names with ones timetracking tools use.
# Development is the default and never specified.
subproject_names = {
    "Contributing back to the community": ["contrib", "contributing", "contributions"],
    "Not billed": ["nb"],
    "Planning": ["plan", "meeting", "pm", "project management"],
}
apply_preferred_names(timelog, "subproject", subproject_names)

# Condense duplicate entries by date, summing the minutes spent, and listing
# the first started and last recorded times for each task.
# The fillna is essential or we drop entries with blank ('None') projects.
tl = timelog.groupby(
    ["date", timelog.project.fillna(""), timelog.subproject.fillna("Development"), "description"]
).agg({"time": 'sum', "started": 'min', "recorded": 'max'}).reset_index()

# We're doing the final conversion to Harvest as a separate step because we
# want to factor out all of the above non-Harvest-specific logic.
latest = tl.recorded.max()
# With no entries there is no date to put in the file names (max() is NaT).
datest = None if tl.empty else tl.date.max().strftime('%Y-%m-%d')

# Separate Harvest from non-Harvest projects, and also filter out any blank
# projects, but save those too for a CSV of the excluded items.
hrvst = tl[tl.project.isin(harvest_project_names.keys())]
other = tl[tl.project.isin(other_project_names.keys())]
unknown = tl[~tl.project.isin(replacement_project_names.keys())]

harvest = hrvst.rename(columns={'date': 'Date', 'project': 'Project', 'subproject': 'Task', 'description': 'Notes'})
harvest["Hours"] = harvest["time"] / 60
harvest["First name"] = "Benjamin"
harvest["Last name"] = "Melançon"
project_client_mapping = settings.harvest_get_projects_clients_map()
harvest["Client"] = harvest["Project"].map(project_client_mapping)
harvest.drop(columns=['started', 'recorded', 'time'], inplace=True)

if debug:
    hrvst_grouped = hrvst.groupby("project").agg({"time": "sum"})["time"] / 60
    other_grouped = other.groupby("project").agg({"time": "sum"})["time"] / 60
    unknown_grouped = unknown.groupby("project").agg({"time": "sum"})["time"] / 60
    print("We do not write to CSV nor update the latest recorded setting when run interactively in the python shell.")
elif tl.empty:
    # Nothing to export: leave the stored latest-recorded value untouched
    # rather than overwriting it with NaT.
    print("No new pomodoro entries found; no CSV written and latest recorded setting left unchanged.")
else:
    harvest.to_csv('harvest-timesheets-' + datest + '.csv', index=False)
    other.to_csv('personal-other-' + datest + '.csv', index=False)
    unknown.to_csv('unknown-' + datest + '.csv', index=False)
    settings.pomodoro_latest_recorded(latest)