mirror of
https://github.com/xitanggg/open-resume
synced 2024-11-03 09:19:21 +01:00
Create getDescriptionsLineIdx and refactor getFirstBulletPointLineIdx - #5
This commit is contained in:
parent
d29b899134
commit
68ac64036a
@ -15,7 +15,7 @@ import {
|
|||||||
import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system";
|
import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system";
|
||||||
import {
|
import {
|
||||||
getBulletPointsFromLines,
|
getBulletPointsFromLines,
|
||||||
getFirstBulletPointLineIdx,
|
getDescriptionsLineIdx,
|
||||||
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
|
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -37,7 +37,7 @@ const hasDegree = (item: TextItem) =>
|
|||||||
const matchGPA = (item: TextItem) => item.text.match(/[0-4]\.\d{1,2}/);
|
const matchGPA = (item: TextItem) => item.text.match(/[0-4]\.\d{1,2}/);
|
||||||
const matchGrade = (item: TextItem) => {
|
const matchGrade = (item: TextItem) => {
|
||||||
const grade = parseFloat(item.text);
|
const grade = parseFloat(item.text);
|
||||||
if (Number.isFinite(grade)) {
|
if (Number.isFinite(grade) && grade <= 110) {
|
||||||
return [String(grade)] as RegExpMatchArray;
|
return [String(grade)] as RegExpMatchArray;
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
@ -87,15 +87,10 @@ export const extractEducation = (sections: ResumeSectionToLines) => {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let descriptions: string[] = [];
|
let descriptions: string[] = [];
|
||||||
const firstBulletPointLineIdx = getFirstBulletPointLineIdx(
|
const descriptionsLineIdx = getDescriptionsLineIdx(subsectionLines);
|
||||||
subsectionLines,
|
if (descriptionsLineIdx !== undefined) {
|
||||||
[":"]
|
const descriptionsLines = subsectionLines.slice(descriptionsLineIdx);
|
||||||
);
|
descriptions = getBulletPointsFromLines(descriptionsLines);
|
||||||
if (firstBulletPointLineIdx !== undefined) {
|
|
||||||
const subsectionBulletPointLines = subsectionLines.slice(
|
|
||||||
firstBulletPointLineIdx
|
|
||||||
);
|
|
||||||
descriptions = getBulletPointsFromLines(subsectionBulletPointLines);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
educations.push({ school, degree, gpa, date, descriptions });
|
educations.push({ school, degree, gpa, date, descriptions });
|
||||||
|
@ -13,7 +13,7 @@ import { divideSectionIntoSubsections } from "lib/parse-resume-from-pdf/extract-
|
|||||||
import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system";
|
import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system";
|
||||||
import {
|
import {
|
||||||
getBulletPointsFromLines,
|
getBulletPointsFromLines,
|
||||||
getFirstBulletPointLineIdx,
|
getDescriptionsLineIdx,
|
||||||
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
|
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
|
||||||
|
|
||||||
export const extractProject = (sections: ResumeSectionToLines) => {
|
export const extractProject = (sections: ResumeSectionToLines) => {
|
||||||
@ -23,11 +23,10 @@ export const extractProject = (sections: ResumeSectionToLines) => {
|
|||||||
const subsections = divideSectionIntoSubsections(lines);
|
const subsections = divideSectionIntoSubsections(lines);
|
||||||
|
|
||||||
for (const subsectionLines of subsections) {
|
for (const subsectionLines of subsections) {
|
||||||
const firstBulletPointLineIdx =
|
const descriptionsLineIdx = getDescriptionsLineIdx(subsectionLines) ?? 1;
|
||||||
getFirstBulletPointLineIdx(subsectionLines) ?? 1;
|
|
||||||
|
|
||||||
const subsectionInfoTextItems = subsectionLines
|
const subsectionInfoTextItems = subsectionLines
|
||||||
.slice(0, firstBulletPointLineIdx)
|
.slice(0, descriptionsLineIdx)
|
||||||
.flat();
|
.flat();
|
||||||
const [date, dateScores] = getTextWithHighestFeatureScore(
|
const [date, dateScores] = getTextWithHighestFeatureScore(
|
||||||
subsectionInfoTextItems,
|
subsectionInfoTextItems,
|
||||||
@ -43,10 +42,8 @@ export const extractProject = (sections: ResumeSectionToLines) => {
|
|||||||
false
|
false
|
||||||
);
|
);
|
||||||
|
|
||||||
const subsectionBulletPointLines = subsectionLines.slice(
|
const descriptionsLines = subsectionLines.slice(descriptionsLineIdx);
|
||||||
firstBulletPointLineIdx
|
const descriptions = getBulletPointsFromLines(descriptionsLines);
|
||||||
);
|
|
||||||
const descriptions = getBulletPointsFromLines(subsectionBulletPointLines);
|
|
||||||
|
|
||||||
projects.push({ project, date, descriptions });
|
projects.push({ project, date, descriptions });
|
||||||
projectsScores.push({
|
projectsScores.push({
|
||||||
|
@ -5,20 +5,18 @@ import { getSectionLinesByKeywords } from "lib/parse-resume-from-pdf/extract-res
|
|||||||
import { initialFeaturedSkills } from "lib/redux/resumeSlice";
|
import { initialFeaturedSkills } from "lib/redux/resumeSlice";
|
||||||
import {
|
import {
|
||||||
getBulletPointsFromLines,
|
getBulletPointsFromLines,
|
||||||
getFirstBulletPointLineIdx,
|
getDescriptionsLineIdx,
|
||||||
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
|
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
|
||||||
|
|
||||||
export const extractSkills = (sections: ResumeSectionToLines) => {
|
export const extractSkills = (sections: ResumeSectionToLines) => {
|
||||||
const lines = getSectionLinesByKeywords(sections, ["skill"]);
|
const lines = getSectionLinesByKeywords(sections, ["skill"]);
|
||||||
const descriptions = getBulletPointsFromLines(lines);
|
const descriptionsLineIdx = getDescriptionsLineIdx(lines) ?? 0;
|
||||||
|
const descriptionsLines = lines.slice(descriptionsLineIdx);
|
||||||
|
const descriptions = getBulletPointsFromLines(descriptionsLines);
|
||||||
|
|
||||||
const featuredSkills = deepClone(initialFeaturedSkills) as FeaturedSkill[];
|
const featuredSkills = deepClone(initialFeaturedSkills) as FeaturedSkill[];
|
||||||
const firstBulletPointLineIndex = getFirstBulletPointLineIdx(lines);
|
if (descriptionsLineIdx !== 0) {
|
||||||
if (
|
const featuredSkillsLines = lines.slice(0, descriptionsLineIdx);
|
||||||
firstBulletPointLineIndex !== undefined &&
|
|
||||||
firstBulletPointLineIndex !== 0
|
|
||||||
) {
|
|
||||||
const featuredSkillsLines = lines.slice(0, firstBulletPointLineIndex);
|
|
||||||
const featuredSkillsTextItems = featuredSkillsLines
|
const featuredSkillsTextItems = featuredSkillsLines
|
||||||
.flat()
|
.flat()
|
||||||
.filter((item) => item.text.trim())
|
.filter((item) => item.text.trim())
|
||||||
|
@ -15,7 +15,7 @@ import { divideSectionIntoSubsections } from "lib/parse-resume-from-pdf/extract-
|
|||||||
import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system";
|
import { getTextWithHighestFeatureScore } from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/feature-scoring-system";
|
||||||
import {
|
import {
|
||||||
getBulletPointsFromLines,
|
getBulletPointsFromLines,
|
||||||
getFirstBulletPointLineIdx,
|
getDescriptionsLineIdx,
|
||||||
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
|
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/bullet-points";
|
||||||
|
|
||||||
// prettier-ignore
|
// prettier-ignore
|
||||||
@ -44,11 +44,10 @@ export const extractWorkExperience = (sections: ResumeSectionToLines) => {
|
|||||||
const subsections = divideSectionIntoSubsections(lines);
|
const subsections = divideSectionIntoSubsections(lines);
|
||||||
|
|
||||||
for (const subsectionLines of subsections) {
|
for (const subsectionLines of subsections) {
|
||||||
const firstBulletPointLineIdx =
|
const descriptionsLineIdx = getDescriptionsLineIdx(subsectionLines) ?? 2;
|
||||||
getFirstBulletPointLineIdx(subsectionLines) ?? 2;
|
|
||||||
|
|
||||||
const subsectionInfoTextItems = subsectionLines
|
const subsectionInfoTextItems = subsectionLines
|
||||||
.slice(0, firstBulletPointLineIdx)
|
.slice(0, descriptionsLineIdx)
|
||||||
.flat();
|
.flat();
|
||||||
const [date, dateScores] = getTextWithHighestFeatureScore(
|
const [date, dateScores] = getTextWithHighestFeatureScore(
|
||||||
subsectionInfoTextItems,
|
subsectionInfoTextItems,
|
||||||
@ -69,10 +68,9 @@ export const extractWorkExperience = (sections: ResumeSectionToLines) => {
|
|||||||
false
|
false
|
||||||
);
|
);
|
||||||
|
|
||||||
const subsectionBulletPointLines = subsectionLines.slice(
|
const subsectionDescriptionsLines =
|
||||||
firstBulletPointLineIdx
|
subsectionLines.slice(descriptionsLineIdx);
|
||||||
);
|
const descriptions = getBulletPointsFromLines(subsectionDescriptionsLines);
|
||||||
const descriptions = getBulletPointsFromLines(subsectionBulletPointLines);
|
|
||||||
|
|
||||||
workExperiences.push({ company, jobTitle, date, descriptions });
|
workExperiences.push({ company, jobTitle, date, descriptions });
|
||||||
workExperiencesScores.push({
|
workExperiencesScores.push({
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
import type { Lines } from "lib/parse-resume-from-pdf/types";
|
import type { Lines, TextItem } from "lib/parse-resume-from-pdf/types";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* List of bullet points
|
* List of bullet points
|
||||||
@ -66,24 +66,6 @@ export const getBulletPointsFromLines = (lines: Lines): string[] => {
|
|||||||
.filter((text) => !!text);
|
.filter((text) => !!text);
|
||||||
};
|
};
|
||||||
|
|
||||||
export const getFirstBulletPointLineIdx = (
|
|
||||||
lines: Lines,
|
|
||||||
additionalChars: string[] = []
|
|
||||||
): number | undefined => {
|
|
||||||
for (let i = 0; i < lines.length; i++) {
|
|
||||||
for (let item of lines[i]) {
|
|
||||||
if (
|
|
||||||
[...BULLET_POINTS, ...additionalChars].some((bullet) =>
|
|
||||||
item.text.includes(bullet)
|
|
||||||
)
|
|
||||||
) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return undefined;
|
|
||||||
};
|
|
||||||
|
|
||||||
const getMostCommonBulletPoint = (str: string): string => {
|
const getMostCommonBulletPoint = (str: string): string => {
|
||||||
const bulletToCount: { [bullet: string]: number } = BULLET_POINTS.reduce(
|
const bulletToCount: { [bullet: string]: number } = BULLET_POINTS.reduce(
|
||||||
(acc: { [bullet: string]: number }, cur) => {
|
(acc: { [bullet: string]: number }, cur) => {
|
||||||
@ -104,3 +86,38 @@ const getMostCommonBulletPoint = (str: string): string => {
|
|||||||
}
|
}
|
||||||
return bulletWithMostCount;
|
return bulletWithMostCount;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const getFirstBulletPointLineIdx = (lines: Lines): number | undefined => {
|
||||||
|
for (let i = 0; i < lines.length; i++) {
|
||||||
|
for (let item of lines[i]) {
|
||||||
|
if (BULLET_POINTS.some((bullet) => item.text.includes(bullet))) {
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return undefined;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Only consider words that don't contain numbers
|
||||||
|
const isWord = (str: string) => /^[^0-9]+$/.test(str);
|
||||||
|
const hasAtLeast8Words = (item: TextItem) =>
|
||||||
|
item.text.split(/\s/).filter(isWord).length >= 8;
|
||||||
|
|
||||||
|
export const getDescriptionsLineIdx = (lines: Lines): number | undefined => {
|
||||||
|
// The main heuristic to determine descriptions is to check if has bullet point
|
||||||
|
let idx = getFirstBulletPointLineIdx(lines);
|
||||||
|
|
||||||
|
// Fallback heuristic if the main heuristic doesn't apply (e.g. LinkedIn resume) to
|
||||||
|
// check if the line has at least 8 words
|
||||||
|
if (idx === undefined) {
|
||||||
|
for (let i = 0; i < lines.length; i++) {
|
||||||
|
const line = lines[i];
|
||||||
|
if (line.length === 1 && hasAtLeast8Words(line[0])) {
|
||||||
|
idx = i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return idx;
|
||||||
|
};
|
||||||
|
@ -8,8 +8,8 @@ export const hasNumber = (item: TextItem) => /[0-9]/.test(item.text);
|
|||||||
export const hasComma = (item: TextItem) => item.text.includes(",");
|
export const hasComma = (item: TextItem) => item.text.includes(",");
|
||||||
export const getHasText = (text: string) => (item: TextItem) =>
|
export const getHasText = (text: string) => (item: TextItem) =>
|
||||||
item.text.includes(text);
|
item.text.includes(text);
|
||||||
export const hasOnlyLettersAndSpaces = (item: TextItem) =>
|
export const hasOnlyLettersSpacesAmpersands = (item: TextItem) =>
|
||||||
/^[A-Za-z\s]+$/.test(item.text);
|
/^[A-Za-z\s&]+$/.test(item.text);
|
||||||
export const hasLetterAndIsAllUpperCase = (item: TextItem) =>
|
export const hasLetterAndIsAllUpperCase = (item: TextItem) =>
|
||||||
hasLetter(item) && item.text.toUpperCase() === item.text;
|
hasLetter(item) && item.text.toUpperCase() === item.text;
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ import type {
|
|||||||
} from "lib/parse-resume-from-pdf/types";
|
} from "lib/parse-resume-from-pdf/types";
|
||||||
import {
|
import {
|
||||||
hasLetterAndIsAllUpperCase,
|
hasLetterAndIsAllUpperCase,
|
||||||
hasOnlyLettersAndSpaces,
|
hasOnlyLettersSpacesAmpersands,
|
||||||
isBold,
|
isBold,
|
||||||
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features";
|
} from "lib/parse-resume-from-pdf/extract-resume-from-sections/lib/common-features";
|
||||||
|
|
||||||
@ -81,12 +81,13 @@ const isSectionTitle = (line: Line, lineNumber: number) => {
|
|||||||
// The following is a fallback heuristic to detect section title if it includes a keyword match
|
// The following is a fallback heuristic to detect section title if it includes a keyword match
|
||||||
// (This heuristics is not well tested and may not work well)
|
// (This heuristics is not well tested and may not work well)
|
||||||
const text = textItem.text.trim();
|
const text = textItem.text.trim();
|
||||||
const textHasAtMost2Words = text.split(" ").length <= 2;
|
const textHasAtMost2Words =
|
||||||
|
text.split(" ").filter((s) => s !== "&").length <= 2;
|
||||||
const startsWithCapitalLetter = /[A-Z]/.test(text.slice(0, 1));
|
const startsWithCapitalLetter = /[A-Z]/.test(text.slice(0, 1));
|
||||||
|
|
||||||
if (
|
if (
|
||||||
textHasAtMost2Words &&
|
textHasAtMost2Words &&
|
||||||
hasOnlyLettersAndSpaces(textItem) &&
|
hasOnlyLettersSpacesAmpersands(textItem) &&
|
||||||
startsWithCapitalLetter &&
|
startsWithCapitalLetter &&
|
||||||
SECTION_TITLE_KEYWORDS.some((keyword) =>
|
SECTION_TITLE_KEYWORDS.some((keyword) =>
|
||||||
text.toLowerCase().includes(keyword)
|
text.toLowerCase().includes(keyword)
|
||||||
|
Loading…
Reference in New Issue
Block a user