From 68ac64036a3040fce0537f765eb1c88570be87d2 Mon Sep 17 00:00:00 2001
From: Xitang <xz522@cornell.edu>
Date: Thu, 6 Jul 2023 00:11:51 -0700
Subject: [PATCH] Create getDescriptionsLineIdx and refactor
 getFirstBulletPointLineIdx - #5

---
 .../extract-education.ts                      | 17 ++----
 .../extract-project.ts                        | 13 ++---
 .../extract-skills.ts                         | 14 ++---
 .../extract-work-experience.ts                | 14 ++---
 .../lib/bullet-points.ts                      | 55 ++++++++++++-------
 .../lib/common-features.ts                    |  4 +-
 .../group-lines-into-sections.ts              |  7 ++-
 7 files changed, 65 insertions(+), 59 deletions(-)

diff --git a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-education.ts b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-education.ts
index 52f2b5d..16cabac 100644
--- a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-education.ts
+++ b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-education.ts
@@ -15,7 +15,7 @@ import {
 import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system";
 import {
   getBulletPointsFromLines,
-  getFirstBulletPointLineIdx,
+  getDescriptionsLineIdx,
 } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
 
 /**
@@ -37,7 +37,7 @@ const hasDegree = (item: TextItem) =>
 const matchGPA = (item: TextItem) => item.text.match(/[0-4]\.\d{1,2}/);
 const matchGrade = (item: TextItem) => {
   const grade = parseFloat(item.text);
-  if (Number.isFinite(grade)) {
+  if (Number.isFinite(grade) && grade <= 110) {
     return [String(grade)] as RegExpMatchArray;
   }
   return null;
@@ -87,15 +87,10 @@ export const extractEducation = (sections: ResumeSectionToLines) => {
     );
 
     let descriptions: string[] = [];
-    const firstBulletPointLineIdx = getFirstBulletPointLineIdx(
-      subsectionLines,
-      [":"]
-    );
-    if (firstBulletPointLineIdx !== undefined) {
-      const subsectionBulletPointLines = subsectionLines.slice(
-        firstBulletPointLineIdx
-      );
-      descriptions = getBulletPointsFromLines(subsectionBulletPointLines);
+    const descriptionsLineIdx = getDescriptionsLineIdx(subsectionLines);
+    if (descriptionsLineIdx !== undefined) {
+      const descriptionsLines = subsectionLines.slice(descriptionsLineIdx);
+      descriptions = getBulletPointsFromLines(descriptionsLines);
     }
 
     educations.push({ school, degree, gpa, date, descriptions });
diff --git a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-project.ts b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-project.ts
index 642e7d2..29181c9 100644
--- a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-project.ts
+++ b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-project.ts
@@ -13,7 +13,7 @@ import { divideSectionIntoSubsections } from "lib/parse-resume-from-pdf/extract-
 import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system";
 import {
   getBulletPointsFromLines,
-  getFirstBulletPointLineIdx,
+  getDescriptionsLineIdx,
 } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
 
 export const extractProject = (sections: ResumeSectionToLines) => {
@@ -23,11 +23,10 @@ export const extractProject = (sections: ResumeSectionToLines) => {
   const subsections = divideSectionIntoSubsections(lines);
 
   for (const subsectionLines of subsections) {
-    const firstBulletPointLineIdx =
-      getFirstBulletPointLineIdx(subsectionLines) ?? 1;
+    const descriptionsLineIdx = getDescriptionsLineIdx(subsectionLines) ?? 1;
 
     const subsectionInfoTextItems = subsectionLines
-      .slice(0, firstBulletPointLineIdx)
+      .slice(0, descriptionsLineIdx)
       .flat();
     const [date, dateScores] = getTextWithHighestFeatureScore(
       subsectionInfoTextItems,
@@ -43,10 +42,8 @@ export const extractProject = (sections: ResumeSectionToLines) => {
       false
     );
 
-    const subsectionBulletPointLines = subsectionLines.slice(
-      firstBulletPointLineIdx
-    );
-    const descriptions = getBulletPointsFromLines(subsectionBulletPointLines);
+    const descriptionsLines = subsectionLines.slice(descriptionsLineIdx);
+    const descriptions = getBulletPointsFromLines(descriptionsLines);
 
     projects.push({ project, date, descriptions });
     projectsScores.push({
diff --git a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-skills.ts b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-skills.ts
index 61f61cc..b0846d9 100644
--- a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-skills.ts
+++ b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-skills.ts
@@ -5,20 +5,18 @@ import { getSectionLinesByKeywords } from "lib/parse-resume-from-pdf/extract-res
 import { initialFeaturedSkills } from "lib/redux/resumeSlice";
 import {
   getBulletPointsFromLines,
-  getFirstBulletPointLineIdx,
+  getDescriptionsLineIdx,
 } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
 
 export const extractSkills = (sections: ResumeSectionToLines) => {
   const lines = getSectionLinesByKeywords(sections, ["skill"]);
-  const descriptions = getBulletPointsFromLines(lines);
+  const descriptionsLineIdx = getDescriptionsLineIdx(lines) ?? 0;
+  const descriptionsLines = lines.slice(descriptionsLineIdx);
+  const descriptions = getBulletPointsFromLines(descriptionsLines);
 
   const featuredSkills = deepClone(initialFeaturedSkills) as FeaturedSkill[];
-  const firstBulletPointLineIndex = getFirstBulletPointLineIdx(lines);
-  if (
-    firstBulletPointLineIndex !== undefined &&
-    firstBulletPointLineIndex !== 0
-  ) {
-    const featuredSkillsLines = lines.slice(0, firstBulletPointLineIndex);
+  if (descriptionsLineIdx !== 0) {
+    const featuredSkillsLines = lines.slice(0, descriptionsLineIdx);
     const featuredSkillsTextItems = featuredSkillsLines
       .flat()
       .filter((item) => item.text.trim())
diff --git a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-work-experience.ts b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-work-experience.ts
index f0ec27f..bb4c843 100644
--- a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-work-experience.ts
+++ b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-work-experience.ts
@@ -15,7 +15,7 @@ import { divideSectionIntoSubsections } from "lib/parse-resume-from-pdf/extract-
 import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system";
 import {
   getBulletPointsFromLines,
-  getFirstBulletPointLineIdx,
+  getDescriptionsLineIdx,
 } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
 
 // prettier-ignore
@@ -44,11 +44,10 @@ export const extractWorkExperience = (sections: ResumeSectionToLines) => {
   const subsections = divideSectionIntoSubsections(lines);
 
   for (const subsectionLines of subsections) {
-    const firstBulletPointLineIdx =
-      getFirstBulletPointLineIdx(subsectionLines) ?? 2;
+    const descriptionsLineIdx = getDescriptionsLineIdx(subsectionLines) ?? 2;
 
     const subsectionInfoTextItems = subsectionLines
-      .slice(0, firstBulletPointLineIdx)
+      .slice(0, descriptionsLineIdx)
       .flat();
     const [date, dateScores] = getTextWithHighestFeatureScore(
       subsectionInfoTextItems,
@@ -69,10 +68,9 @@ export const extractWorkExperience = (sections: ResumeSectionToLines) => {
       false
     );
 
-    const subsectionBulletPointLines = subsectionLines.slice(
-      firstBulletPointLineIdx
-    );
-    const descriptions = getBulletPointsFromLines(subsectionBulletPointLines);
+    const subsectionDescriptionsLines =
+      subsectionLines.slice(descriptionsLineIdx);
+    const descriptions = getBulletPointsFromLines(subsectionDescriptionsLines);
 
     workExperiences.push({ company, jobTitle, date, descriptions });
     workExperiencesScores.push({
diff --git a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points.ts b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points.ts
index 95c96a0..f0db1ef 100644
--- a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points.ts
+++ b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points.ts
@@ -1,4 +1,4 @@
-import type { Lines } from "lib/parse-resume-from-pdf/types";
+import type { Lines, TextItem } from "lib/parse-resume-from-pdf/types";
 
 /**
  * List of bullet points
@@ -66,24 +66,6 @@ export const getBulletPointsFromLines = (lines: Lines): string[] => {
     .filter((text) => !!text);
 };
 
-export const getFirstBulletPointLineIdx = (
-  lines: Lines,
-  additionalChars: string[] = []
-): number | undefined => {
-  for (let i = 0; i < lines.length; i++) {
-    for (let item of lines[i]) {
-      if (
-        [...BULLET_POINTS, ...additionalChars].some((bullet) =>
-          item.text.includes(bullet)
-        )
-      ) {
-        return i;
-      }
-    }
-  }
-  return undefined;
-};
-
 const getMostCommonBulletPoint = (str: string): string => {
   const bulletToCount: { [bullet: string]: number } = BULLET_POINTS.reduce(
     (acc: { [bullet: string]: number }, cur) => {
@@ -104,3 +86,38 @@ const getMostCommonBulletPoint = (str: string): string => {
   }
   return bulletWithMostCount;
 };
+
+const getFirstBulletPointLineIdx = (lines: Lines): number | undefined => {
+  for (let i = 0; i < lines.length; i++) {
+    for (let item of lines[i]) {
+      if (BULLET_POINTS.some((bullet) => item.text.includes(bullet))) {
+        return i;
+      }
+    }
+  }
+  return undefined;
+};
+
+// Only consider words that don't contain numbers
+const isWord = (str: string) => /^[^0-9]+$/.test(str);
+const hasAtLeast8Words = (item: TextItem) =>
+  item.text.split(/\s/).filter(isWord).length >= 8;
+
+export const getDescriptionsLineIdx = (lines: Lines): number | undefined => {
+  // The main heuristic to determine descriptions is to check whether a line has a bullet point
+  let idx = getFirstBulletPointLineIdx(lines);
+
+  // Fallback heuristic when the main heuristic doesn't apply (e.g. LinkedIn resume):
+  // pick the first line that is a single text item with at least 8 words
+  if (idx === undefined) {
+    for (let i = 0; i < lines.length; i++) {
+      const line = lines[i];
+      if (line.length === 1 && hasAtLeast8Words(line[0])) {
+        idx = i;
+        break;
+      }
+    }
+  }
+
+  return idx;
+};
diff --git a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features.ts b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features.ts
index d21f61d..9caebb3 100644
--- a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features.ts
+++ b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features.ts
@@ -8,8 +8,8 @@ export const hasNumber = (item: TextItem) => /[0-9]/.test(item.text);
 export const hasComma = (item: TextItem) => item.text.includes(",");
 export const getHasText = (text: string) => (item: TextItem) =>
   item.text.includes(text);
-export const hasOnlyLettersAndSpaces = (item: TextItem) =>
-  /^[A-Za-z\s]+$/.test(item.text);
+export const hasOnlyLettersSpacesAmpersands = (item: TextItem) =>
+  /^[A-Za-z\s&]+$/.test(item.text);
 export const hasLetterAndIsAllUpperCase = (item: TextItem) =>
   hasLetter(item) && item.text.toUpperCase() === item.text;
 
diff --git a/src/app/lib/parse-resume-from-pdf/group-lines-into-sections.ts b/src/app/lib/parse-resume-from-pdf/group-lines-into-sections.ts
index 117b085..d78ffb3 100644
--- a/src/app/lib/parse-resume-from-pdf/group-lines-into-sections.ts
+++ b/src/app/lib/parse-resume-from-pdf/group-lines-into-sections.ts
@@ -6,7 +6,7 @@ import type {
 } from "lib/parse-resume-from-pdf/types";
 import {
   hasLetterAndIsAllUpperCase,
-  hasOnlyLettersAndSpaces,
+  hasOnlyLettersSpacesAmpersands,
   isBold,
 } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features";
 
@@ -81,12 +81,13 @@ const isSectionTitle = (line: Line, lineNumber: number) => {
   // The following is a fallback heuristic to detect section title if it includes a keyword match
   // (This heuristics is not well tested and may not work well)
   const text = textItem.text.trim();
-  const textHasAtMost2Words = text.split(" ").length <= 2;
+  const textHasAtMost2Words =
+    text.split(" ").filter((s) => s !== "&").length <= 2;
   const startsWithCapitalLetter = /[A-Z]/.test(text.slice(0, 1));
 
   if (
     textHasAtMost2Words &&
-    hasOnlyLettersAndSpaces(textItem) &&
+    hasOnlyLettersSpacesAmpersands(textItem) &&
     startsWithCapitalLetter &&
     SECTION_TITLE_KEYWORDS.some((keyword) =>
       text.toLowerCase().includes(keyword)