From 68ac64036a3040fce0537f765eb1c88570be87d2 Mon Sep 17 00:00:00 2001 From: Xitang Date: Thu, 6 Jul 2023 00:11:51 -0700 Subject: [PATCH] Create getDescriptionsLineIdx and refactor getFirstBulletPointLineIdx - #5 --- .../extract-education.ts | 17 ++---- .../extract-project.ts | 13 ++--- .../extract-skills.ts | 14 ++--- .../extract-work-experience.ts | 14 ++--- .../lib/bullet-points.ts | 55 ++++++++++++------- .../lib/common-features.ts | 4 +- .../group-lines-into-sections.ts | 7 ++- 7 files changed, 65 insertions(+), 59 deletions(-) diff --git a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-education.ts b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-education.ts index 52f2b5d..16cabac 100644 --- a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-education.ts +++ b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-education.ts @@ -15,7 +15,7 @@ import { import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system"; import { getBulletPointsFromLines, - getFirstBulletPointLineIdx, + getDescriptionsLineIdx, } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points"; /** @@ -37,7 +37,7 @@ const hasDegree = (item: TextItem) => const matchGPA = (item: TextItem) => item.text.match(/[0-4]\.\d{1,2}/); const matchGrade = (item: TextItem) => { const grade = parseFloat(item.text); - if (Number.isFinite(grade)) { + if (Number.isFinite(grade) && grade <= 110) { return [String(grade)] as RegExpMatchArray; } return null; @@ -87,15 +87,10 @@ export const extractEducation = (sections: ResumeSectionToLines) => { ); let descriptions: string[] = []; - const firstBulletPointLineIdx = getFirstBulletPointLineIdx( - subsectionLines, - [":"] - ); - if (firstBulletPointLineIdx !== undefined) { - const subsectionBulletPointLines = subsectionLines.slice( - firstBulletPointLineIdx - ); - descriptions = getBulletPointsFromLines(subsectionBulletPointLines); + const descriptionsLineIdx = getDescriptionsLineIdx(subsectionLines); + if (descriptionsLineIdx !== undefined) { + const descriptionsLines = subsectionLines.slice(descriptionsLineIdx); + descriptions = getBulletPointsFromLines(descriptionsLines); } educations.push({ school, degree, gpa, date, descriptions }); diff --git a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-project.ts b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-project.ts index 642e7d2..29181c9 100644 --- a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-project.ts +++ b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-project.ts @@ -13,7 +13,7 @@ import { divideSectionIntoSubsections } from "lib/parse-resume-from-pdf/extract- import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system"; import { getBulletPointsFromLines, - getFirstBulletPointLineIdx, + getDescriptionsLineIdx, } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points"; export const extractProject = (sections: ResumeSectionToLines) => { @@ -23,11 +23,10 @@ export const extractProject = (sections: ResumeSectionToLines) => { const subsections = divideSectionIntoSubsections(lines); for (const subsectionLines of subsections) { - const firstBulletPointLineIdx = - getFirstBulletPointLineIdx(subsectionLines) ?? 1; + const descriptionsLineIdx = getDescriptionsLineIdx(subsectionLines) ?? 1; const subsectionInfoTextItems = subsectionLines - .slice(0, firstBulletPointLineIdx) + .slice(0, descriptionsLineIdx) .flat(); const [date, dateScores] = getTextWithHighestFeatureScore( subsectionInfoTextItems, @@ -43,10 +42,8 @@ export const extractProject = (sections: ResumeSectionToLines) => { false ); - const subsectionBulletPointLines = subsectionLines.slice( - firstBulletPointLineIdx - ); - const descriptions = getBulletPointsFromLines(subsectionBulletPointLines); + const descriptionsLines = subsectionLines.slice(descriptionsLineIdx); + const descriptions = getBulletPointsFromLines(descriptionsLines); projects.push({ project, date, descriptions }); projectsScores.push({ diff --git a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-skills.ts b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-skills.ts index 61f61cc..b0846d9 100644 --- a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-skills.ts +++ b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-skills.ts @@ -5,20 +5,18 @@ import { getSectionLinesByKeywords } from "lib/parse-resume-from-pdf/extract-res import { initialFeaturedSkills } from "lib/redux/resumeSlice"; import { getBulletPointsFromLines, - getFirstBulletPointLineIdx, + getDescriptionsLineIdx, } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points"; export const extractSkills = (sections: ResumeSectionToLines) => { const lines = getSectionLinesByKeywords(sections, ["skill"]); - const descriptions = getBulletPointsFromLines(lines); + const descriptionsLineIdx = getDescriptionsLineIdx(lines) ?? 0; + const descriptionsLines = lines.slice(descriptionsLineIdx); + const descriptions = getBulletPointsFromLines(descriptionsLines); const featuredSkills = deepClone(initialFeaturedSkills) as FeaturedSkill[]; - const firstBulletPointLineIndex = getFirstBulletPointLineIdx(lines); - if ( - firstBulletPointLineIndex !== undefined && - firstBulletPointLineIndex !== 0 - ) { - const featuredSkillsLines = lines.slice(0, firstBulletPointLineIndex); + if (descriptionsLineIdx !== 0) { + const featuredSkillsLines = lines.slice(0, descriptionsLineIdx); const featuredSkillsTextItems = featuredSkillsLines .flat() .filter((item) => item.text.trim()) diff --git a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-work-experience.ts b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-work-experience.ts index f0ec27f..bb4c843 100644 --- a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-work-experience.ts +++ b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/extract-work-experience.ts @@ -15,7 +15,7 @@ import { divideSectionIntoSubsections } from "lib/parse-resume-from-pdf/extract- import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system"; import { getBulletPointsFromLines, - getFirstBulletPointLineIdx, + getDescriptionsLineIdx, } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points"; // prettier-ignore @@ -44,11 +44,10 @@ export const extractWorkExperience = (sections: ResumeSectionToLines) => { const subsections = divideSectionIntoSubsections(lines); for (const subsectionLines of subsections) { - const firstBulletPointLineIdx = - getFirstBulletPointLineIdx(subsectionLines) ?? 2; + const descriptionsLineIdx = getDescriptionsLineIdx(subsectionLines) ?? 2; const subsectionInfoTextItems = subsectionLines - .slice(0, firstBulletPointLineIdx) + .slice(0, descriptionsLineIdx) .flat(); const [date, dateScores] = getTextWithHighestFeatureScore( subsectionInfoTextItems, @@ -69,10 +68,9 @@ export const extractWorkExperience = (sections: ResumeSectionToLines) => { false ); - const subsectionBulletPointLines = subsectionLines.slice( - firstBulletPointLineIdx - ); - const descriptions = getBulletPointsFromLines(subsectionBulletPointLines); + const subsectionDescriptionsLines = + subsectionLines.slice(descriptionsLineIdx); + const descriptions = getBulletPointsFromLines(subsectionDescriptionsLines); workExperiences.push({ company, jobTitle, date, descriptions }); workExperiencesScores.push({ diff --git a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points.ts b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points.ts index 95c96a0..f0db1ef 100644 --- a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points.ts +++ b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points.ts @@ -1,4 +1,4 @@ -import type { Lines } from "lib/parse-resume-from-pdf/types"; +import type { Lines, TextItem } from "lib/parse-resume-from-pdf/types"; /** * List of bullet points @@ -66,24 +66,6 @@ export const getBulletPointsFromLines = (lines: Lines): string[] => { .filter((text) => !!text); }; -export const getFirstBulletPointLineIdx = ( - lines: Lines, - additionalChars: string[] = [] -): number | undefined => { - for (let i = 0; i < lines.length; i++) { - for (let item of lines[i]) { - if ( - [...BULLET_POINTS, ...additionalChars].some((bullet) => - item.text.includes(bullet) - ) - ) { - return i; - } - } - } - return undefined; -}; - const getMostCommonBulletPoint = (str: string): string => { const bulletToCount: { [bullet: string]: number } = BULLET_POINTS.reduce( (acc: { [bullet: string]: number }, cur) => { @@ -104,3 +86,38 @@ const getMostCommonBulletPoint = (str: string): string => { } return bulletWithMostCount; }; + +const getFirstBulletPointLineIdx = (lines: Lines): number | undefined => { + for (let i = 0; i < lines.length; i++) { + for (let item of lines[i]) { + if (BULLET_POINTS.some((bullet) => item.text.includes(bullet))) { + return i; + } + } + } + return undefined; +}; + +// Only consider words that don't contain numbers +const isWord = (str: string) => /^[^0-9]+$/.test(str); +const hasAtLeast8Words = (item: TextItem) => + item.text.split(/\s/).filter(isWord).length >= 8; + +export const getDescriptionsLineIdx = (lines: Lines): number | undefined => { + // The main heuristic to determine descriptions is to check if has bullet point + let idx = getFirstBulletPointLineIdx(lines); + + // Fallback heuristic if the main heuristic doesn't apply (e.g. LinkedIn resume) to + // check if the line has at least 8 words + if (idx === undefined) { + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + if (line.length === 1 && hasAtLeast8Words(line[0])) { + idx = i; + break; + } + } + } + + return idx; +}; diff --git a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features.ts b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features.ts index d21f61d..9caebb3 100644 --- a/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features.ts +++ b/src/app/lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features.ts @@ -8,8 +8,8 @@ export const hasNumber = (item: TextItem) => /[0-9]/.test(item.text); export const hasComma = (item: TextItem) => item.text.includes(","); export const getHasText = (text: string) => (item: TextItem) => item.text.includes(text); -export const hasOnlyLettersAndSpaces = (item: TextItem) => - /^[A-Za-z\s]+$/.test(item.text); +export const hasOnlyLettersSpacesAmpersands = (item: TextItem) => + /^[A-Za-z\s&]+$/.test(item.text); export const hasLetterAndIsAllUpperCase = (item: TextItem) => hasLetter(item) && item.text.toUpperCase() === item.text; diff --git a/src/app/lib/parse-resume-from-pdf/group-lines-into-sections.ts b/src/app/lib/parse-resume-from-pdf/group-lines-into-sections.ts index 117b085..d78ffb3 100644 --- a/src/app/lib/parse-resume-from-pdf/group-lines-into-sections.ts +++ b/src/app/lib/parse-resume-from-pdf/group-lines-into-sections.ts @@ -6,7 +6,7 @@ import type { } from "lib/parse-resume-from-pdf/types"; import { hasLetterAndIsAllUpperCase, - hasOnlyLettersAndSpaces, + hasOnlyLettersSpacesAmpersands, isBold, } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features"; @@ -81,12 +81,13 @@ const isSectionTitle = (line: Line, lineNumber: number) => { // The following is a fallback heuristic to detect section title if it includes a keyword match // (This heuristics is not well tested and may not work well) const text = textItem.text.trim(); - const textHasAtMost2Words = text.split(" ").length <= 2; + const textHasAtMost2Words = + text.split(" ").filter((s) => s !== "&").length <= 2; const startsWithCapitalLetter = /[A-Z]/.test(text.slice(0, 1)); if ( textHasAtMost2Words && - hasOnlyLettersAndSpaces(textItem) && + hasOnlyLettersSpacesAmpersands(textItem) && startsWithCapitalLetter && SECTION_TITLE_KEYWORDS.some((keyword) => text.toLowerCase().includes(keyword)