1
mirror of https://github.com/xitanggg/open-resume synced 2024-11-03 09:19:21 +01:00

Create getDescriptionsLineIdx and refactor getFirstBulletPointLineIdx - #5

This commit is contained in:
Xitang 2023-07-06 00:11:51 -07:00
parent d29b899134
commit 68ac64036a
7 changed files with 65 additions and 59 deletions

View File

@ -15,7 +15,7 @@ import {
import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system"; import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system";
import { import {
getBulletPointsFromLines, getBulletPointsFromLines,
getFirstBulletPointLineIdx, getDescriptionsLineIdx,
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points"; } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
/** /**
@ -37,7 +37,7 @@ const hasDegree = (item: TextItem) =>
const matchGPA = (item: TextItem) => item.text.match(/[0-4]\.\d{1,2}/); const matchGPA = (item: TextItem) => item.text.match(/[0-4]\.\d{1,2}/);
const matchGrade = (item: TextItem) => { const matchGrade = (item: TextItem) => {
const grade = parseFloat(item.text); const grade = parseFloat(item.text);
if (Number.isFinite(grade)) { if (Number.isFinite(grade) && grade <= 110) {
return [String(grade)] as RegExpMatchArray; return [String(grade)] as RegExpMatchArray;
} }
return null; return null;
@ -87,15 +87,10 @@ export const extractEducation = (sections: ResumeSectionToLines) => {
); );
let descriptions: string[] = []; let descriptions: string[] = [];
const firstBulletPointLineIdx = getFirstBulletPointLineIdx( const descriptionsLineIdx = getDescriptionsLineIdx(subsectionLines);
subsectionLines, if (descriptionsLineIdx !== undefined) {
[":"] const descriptionsLines = subsectionLines.slice(descriptionsLineIdx);
); descriptions = getBulletPointsFromLines(descriptionsLines);
if (firstBulletPointLineIdx !== undefined) {
const subsectionBulletPointLines = subsectionLines.slice(
firstBulletPointLineIdx
);
descriptions = getBulletPointsFromLines(subsectionBulletPointLines);
} }
educations.push({ school, degree, gpa, date, descriptions }); educations.push({ school, degree, gpa, date, descriptions });

View File

@ -13,7 +13,7 @@ import { divideSectionIntoSubsections } from "lib/parse-resume-from-pdf/extract-
import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system"; import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system";
import { import {
getBulletPointsFromLines, getBulletPointsFromLines,
getFirstBulletPointLineIdx, getDescriptionsLineIdx,
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points"; } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
export const extractProject = (sections: ResumeSectionToLines) => { export const extractProject = (sections: ResumeSectionToLines) => {
@ -23,11 +23,10 @@ export const extractProject = (sections: ResumeSectionToLines) => {
const subsections = divideSectionIntoSubsections(lines); const subsections = divideSectionIntoSubsections(lines);
for (const subsectionLines of subsections) { for (const subsectionLines of subsections) {
const firstBulletPointLineIdx = const descriptionsLineIdx = getDescriptionsLineIdx(subsectionLines) ?? 1;
getFirstBulletPointLineIdx(subsectionLines) ?? 1;
const subsectionInfoTextItems = subsectionLines const subsectionInfoTextItems = subsectionLines
.slice(0, firstBulletPointLineIdx) .slice(0, descriptionsLineIdx)
.flat(); .flat();
const [date, dateScores] = getTextWithHighestFeatureScore( const [date, dateScores] = getTextWithHighestFeatureScore(
subsectionInfoTextItems, subsectionInfoTextItems,
@ -43,10 +42,8 @@ export const extractProject = (sections: ResumeSectionToLines) => {
false false
); );
const subsectionBulletPointLines = subsectionLines.slice( const descriptionsLines = subsectionLines.slice(descriptionsLineIdx);
firstBulletPointLineIdx const descriptions = getBulletPointsFromLines(descriptionsLines);
);
const descriptions = getBulletPointsFromLines(subsectionBulletPointLines);
projects.push({ project, date, descriptions }); projects.push({ project, date, descriptions });
projectsScores.push({ projectsScores.push({

View File

@ -5,20 +5,18 @@ import { getSectionLinesByKeywords } from "lib/parse-resume-from-pdf/extract-res
import { initialFeaturedSkills } from "lib/redux/resumeSlice"; import { initialFeaturedSkills } from "lib/redux/resumeSlice";
import { import {
getBulletPointsFromLines, getBulletPointsFromLines,
getFirstBulletPointLineIdx, getDescriptionsLineIdx,
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points"; } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
export const extractSkills = (sections: ResumeSectionToLines) => { export const extractSkills = (sections: ResumeSectionToLines) => {
const lines = getSectionLinesByKeywords(sections, ["skill"]); const lines = getSectionLinesByKeywords(sections, ["skill"]);
const descriptions = getBulletPointsFromLines(lines); const descriptionsLineIdx = getDescriptionsLineIdx(lines) ?? 0;
const descriptionsLines = lines.slice(descriptionsLineIdx);
const descriptions = getBulletPointsFromLines(descriptionsLines);
const featuredSkills = deepClone(initialFeaturedSkills) as FeaturedSkill[]; const featuredSkills = deepClone(initialFeaturedSkills) as FeaturedSkill[];
const firstBulletPointLineIndex = getFirstBulletPointLineIdx(lines); if (descriptionsLineIdx !== 0) {
if ( const featuredSkillsLines = lines.slice(0, descriptionsLineIdx);
firstBulletPointLineIndex !== undefined &&
firstBulletPointLineIndex !== 0
) {
const featuredSkillsLines = lines.slice(0, firstBulletPointLineIndex);
const featuredSkillsTextItems = featuredSkillsLines const featuredSkillsTextItems = featuredSkillsLines
.flat() .flat()
.filter((item) => item.text.trim()) .filter((item) => item.text.trim())

View File

@ -15,7 +15,7 @@ import { divideSectionIntoSubsections } from "lib/parse-resume-from-pdf/extract-
import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system"; import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system";
import { import {
getBulletPointsFromLines, getBulletPointsFromLines,
getFirstBulletPointLineIdx, getDescriptionsLineIdx,
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points"; } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
// prettier-ignore // prettier-ignore
@ -44,11 +44,10 @@ export const extractWorkExperience = (sections: ResumeSectionToLines) => {
const subsections = divideSectionIntoSubsections(lines); const subsections = divideSectionIntoSubsections(lines);
for (const subsectionLines of subsections) { for (const subsectionLines of subsections) {
const firstBulletPointLineIdx = const descriptionsLineIdx = getDescriptionsLineIdx(subsectionLines) ?? 2;
getFirstBulletPointLineIdx(subsectionLines) ?? 2;
const subsectionInfoTextItems = subsectionLines const subsectionInfoTextItems = subsectionLines
.slice(0, firstBulletPointLineIdx) .slice(0, descriptionsLineIdx)
.flat(); .flat();
const [date, dateScores] = getTextWithHighestFeatureScore( const [date, dateScores] = getTextWithHighestFeatureScore(
subsectionInfoTextItems, subsectionInfoTextItems,
@ -69,10 +68,9 @@ export const extractWorkExperience = (sections: ResumeSectionToLines) => {
false false
); );
const subsectionBulletPointLines = subsectionLines.slice( const subsectionDescriptionsLines =
firstBulletPointLineIdx subsectionLines.slice(descriptionsLineIdx);
); const descriptions = getBulletPointsFromLines(subsectionDescriptionsLines);
const descriptions = getBulletPointsFromLines(subsectionBulletPointLines);
workExperiences.push({ company, jobTitle, date, descriptions }); workExperiences.push({ company, jobTitle, date, descriptions });
workExperiencesScores.push({ workExperiencesScores.push({

View File

@ -1,4 +1,4 @@
import type { Lines } from "lib/parse-resume-from-pdf/types"; import type { Lines, TextItem } from "lib/parse-resume-from-pdf/types";
/** /**
* List of bullet points * List of bullet points
@ -66,24 +66,6 @@ export const getBulletPointsFromLines = (lines: Lines): string[] => {
.filter((text) => !!text); .filter((text) => !!text);
}; };
/**
 * Returns the index of the first line holding a text item whose text includes
 * a bullet point marker, or undefined when no line has one.
 *
 * @param lines - Lines to scan in order
 * @param additionalChars - Extra markers matched in addition to BULLET_POINTS
 */
export const getFirstBulletPointLineIdx = (
  lines: Lines,
  additionalChars: string[] = []
): number | undefined => {
  // The marker list is the same for every text item, so build it a single time
  const markers = [...BULLET_POINTS, ...additionalChars];
  const lineIdx = lines.findIndex((line) =>
    line.some((item) => markers.some((marker) => item.text.includes(marker)))
  );
  return lineIdx === -1 ? undefined : lineIdx;
};
const getMostCommonBulletPoint = (str: string): string => { const getMostCommonBulletPoint = (str: string): string => {
const bulletToCount: { [bullet: string]: number } = BULLET_POINTS.reduce( const bulletToCount: { [bullet: string]: number } = BULLET_POINTS.reduce(
(acc: { [bullet: string]: number }, cur) => { (acc: { [bullet: string]: number }, cur) => {
@ -104,3 +86,38 @@ const getMostCommonBulletPoint = (str: string): string => {
} }
return bulletWithMostCount; return bulletWithMostCount;
}; };
/**
 * Returns the index of the first line holding a text item whose text includes
 * one of the BULLET_POINTS entries, or undefined when no line has one.
 */
const getFirstBulletPointLineIdx = (lines: Lines): number | undefined => {
  const lineIdx = lines.findIndex((line) =>
    line.some((item) =>
      BULLET_POINTS.some((bullet) => item.text.includes(bullet))
    )
  );
  // findIndex signals "not found" with -1; callers expect undefined instead
  return lineIdx === -1 ? undefined : lineIdx;
};
/**
 * A string counts as a word only if it is non-empty and contains no digit
 * characters 0-9.
 */
const isWord = (str: string): boolean => {
  const digitFreePattern = /^[^0-9]+$/;
  return digitFreePattern.test(str);
};
/**
 * Checks whether the text item has 8 or more whitespace-separated tokens
 * that pass isWord.
 */
const hasAtLeast8Words = (item: TextItem) => {
  const tokens = item.text.split(/\s/);
  let wordCount = 0;
  for (const token of tokens) {
    if (isWord(token)) {
      wordCount += 1;
    }
  }
  return wordCount >= 8;
};
/**
 * Returns the index of the line where the descriptions start, or undefined
 * when neither heuristic below finds one.
 */
export const getDescriptionsLineIdx = (lines: Lines): number | undefined => {
  // Main heuristic: descriptions start at the first line that has a bullet point
  const bulletPointLineIdx = getFirstBulletPointLineIdx(lines);
  if (bulletPointLineIdx !== undefined) {
    return bulletPointLineIdx;
  }

  // Fallback heuristic when there is no bullet point (e.g. LinkedIn resume):
  // use the first line made of a single text item that has at least 8 words
  const longLineIdx = lines.findIndex(
    (line) => line.length === 1 && hasAtLeast8Words(line[0])
  );
  return longLineIdx === -1 ? undefined : longLineIdx;
};

View File

@ -8,8 +8,8 @@ export const hasNumber = (item: TextItem) => /[0-9]/.test(item.text);
export const hasComma = (item: TextItem) => item.text.includes(","); export const hasComma = (item: TextItem) => item.text.includes(",");
export const getHasText = (text: string) => (item: TextItem) => export const getHasText = (text: string) => (item: TextItem) =>
item.text.includes(text); item.text.includes(text);
export const hasOnlyLettersAndSpaces = (item: TextItem) => export const hasOnlyLettersSpacesAmpersands = (item: TextItem) =>
/^[A-Za-z\s]+$/.test(item.text); /^[A-Za-z\s&]+$/.test(item.text);
export const hasLetterAndIsAllUpperCase = (item: TextItem) => export const hasLetterAndIsAllUpperCase = (item: TextItem) =>
hasLetter(item) && item.text.toUpperCase() === item.text; hasLetter(item) && item.text.toUpperCase() === item.text;

View File

@ -6,7 +6,7 @@ import type {
} from "lib/parse-resume-from-pdf/types"; } from "lib/parse-resume-from-pdf/types";
import { import {
hasLetterAndIsAllUpperCase, hasLetterAndIsAllUpperCase,
hasOnlyLettersAndSpaces, hasOnlyLettersSpacesAmpersands,
isBold, isBold,
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features"; } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features";
@ -81,12 +81,13 @@ const isSectionTitle = (line: Line, lineNumber: number) => {
// The following is a fallback heuristic to detect section title if it includes a keyword match // The following is a fallback heuristic to detect section title if it includes a keyword match
// (This heuristic is not well tested and may not work well) // (This heuristic is not well tested and may not work well)
const text = textItem.text.trim(); const text = textItem.text.trim();
const textHasAtMost2Words = text.split(" ").length <= 2; const textHasAtMost2Words =
text.split(" ").filter((s) => s !== "&").length <= 2;
const startsWithCapitalLetter = /[A-Z]/.test(text.slice(0, 1)); const startsWithCapitalLetter = /[A-Z]/.test(text.slice(0, 1));
if ( if (
textHasAtMost2Words && textHasAtMost2Words &&
hasOnlyLettersAndSpaces(textItem) && hasOnlyLettersSpacesAmpersands(textItem) &&
startsWithCapitalLetter && startsWithCapitalLetter &&
SECTION_TITLE_KEYWORDS.some((keyword) => SECTION_TITLE_KEYWORDS.some((keyword) =>
text.toLowerCase().includes(keyword) text.toLowerCase().includes(keyword)