diff --git a/README.md b/README.md index c3d3122..d9fc580 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ This parser is able to handle name patterns with and without comma: ``` ... [lastname] ..., ... [firstname] ..., [suffix] ``` +Without a comma, the string is first checked against identifiers for company names (Ltd., Inc., etc.). If one is found, the whole string is returned as a company name. ### Supported parts - salutations (e.g. Mr, Mrs, Dr, etc.) @@ -49,6 +50,8 @@ This parser is able to handle name patterns with and without comma: - nicknames (parts within parenthesis, brackets etc.) - last names (also supports prefixes like von, de etc.) - suffixes (Jr, Senior, 3rd, PhD, etc.) +- titles (academic titles like Dr. med., Dr.h.c., etc.) +- lastname extensions (nobility predicates like Gräfin, Baron) ### Other features - multi-language support for salutations, suffixes and lastname prefixes @@ -77,6 +80,7 @@ $parser = new TheIconic\NameParser\Parser(); $name = $parser->parse($input); +echo $name->getCompany(); echo $name->getSalutation(); echo $name->getFirstname(); echo $name->getLastname(); @@ -84,6 +88,8 @@ echo $name->getMiddlename(); echo $name->getNickname(); echo $name->getInitials(); echo $name->getSuffix(); +echo $name->getTitle(); +echo $name->getExtension(); print_r($name->getAll()); diff --git a/phpstan.neon b/phpstan.neon new file mode 100644 index 0000000..a42de54 --- /dev/null +++ b/phpstan.neon @@ -0,0 +1,4 @@ +parameters: + inferPrivatePropertyTypeFromConstructor: true + ignoreErrors: + - '#Array has [0-9] duplicate keys with value *#' \ No newline at end of file diff --git a/src/Language/English.php b/src/Language/English.php index 5920268..4db20a1 100644 --- a/src/Language/English.php +++ b/src/Language/English.php @@ -81,4 +81,23 @@ public function getLastnamePrefixes(): array { return self::LASTNAME_PREFIXES; } + + // see German for further information + public function getExtensions(): array + { + return []; + } + + // see German for 
further information + public function getTitles(): array + { + return []; + } + + // see German for further information + public function getCompanies(): array + { + return []; + } } + diff --git a/src/Language/German.php b/src/Language/German.php index 491bb4c..0f71696 100644 --- a/src/Language/German.php +++ b/src/Language/German.php @@ -4,7 +4,28 @@ use TheIconic\NameParser\LanguageInterface; -class German implements LanguageInterface +/** + * The only structured definition of name parts and their allowed values + * I could find on the web follows the HL7 (Health Level 7) standard + * @see https://simplifier.net/guide/LeitfadenBasisDE/PatientimVersichertenstammdatenmanagementVSDM + * + * Names in Germany can be structured like this: + * "[SALUTATION] [TITLES] [FIRSTNAME] [MIDDLENAMES] [EXTENSION] [LASTNAME_PREFIX] [LASTNAME], [SUFFIX]" + * 1 n 1 n 1 1 1 n + * @see https://wiki.hl7.de/index.php?title=bp:Personennamen + * + * Example: + * "Herr Prof. Dr. med. Dr. rer. nat. Fritz Julius Karl Freiherr von und zu Rathenburg vor der Isar, MdB" + * Where LASTNAME (FAMILYNAME/SURNAME) is "Rathenburg vor der Isar" + * It is rather difficult to parse this name correctly, because "vor der" in this case + * is part of the LASTNAME, but can also be a defined LASTNAME PREFIX! + * + * Derived from the standard for sorting (DIN 5007-2), + * the name can also be written in this way (listings): + * "Rathenburg vor der Isar, Fritz Julius Karl Freiherr von und zu" + */ + + class German implements LanguageInterface { const SUFFIXES = [ '1.' => '1.', @@ -17,18 +38,607 @@ class German implements LanguageInterface 'iii' => 'III', 'iv' => 'IV', 'v' => 'V', + 'jr.' => 'Jr.', + 'junior' => 'Junior', + 'senior' => 'Senior', + 'sr.' => 'Sr.', + 'der ältere' => 'der Ältere', + 'd. ä.' => 'd. Ä.', + 'd.ä.' => 'd.Ä.', + 'der jüngere' => 'der Jüngere', + 'd. j.' => 'd. J.', + 'd.j.' 
=> 'd.J.', ]; const SALUTATIONS = [ 'herr' => 'Herr', 'hr' => 'Herr', 'frau' => 'Frau', - 'fr' => 'Frau' + 'fr' => 'Frau', ]; + /** + * the following list is according to HL7 (Health Level 7) standard + * @see https://www.vdek.com/vertragspartner/arbeitgeber/deuev/_jcr_content/par/download_8/file.res/Anlage_07_Vers.pdf + */ const LASTNAME_PREFIXES = [ + 'a' => 'a', + 'aan de' => 'aan de', + 'aan den' => 'aan den', + 'al' => 'al', + 'am' => 'am', + 'an' => 'an', + 'an der' => 'an der', + 'auf' => 'auf', + 'auf dem' => 'auf dem', + 'auf der' => 'auf der', + 'auf m' => 'auf m', + 'aufm' => 'aufm', + 'auff m' => 'auff m', + 'aus' => 'aus', + 'aus dem' => 'aus dem', + 'aus den' => 'aus den', + 'aus der' => 'aus der', + 'b' => 'b', + 'be' => 'be', + 'bei' => 'bei', + 'bei der' => 'bei der', + 'beim' => 'beim', + 'ben' => 'ben', + 'bey' => 'bey', + 'bey der' => 'bey der', + 'che' => 'che', + 'cid' => 'cid', + 'd' => 'd', + 'd.' => 'd.', + "d'" => "d'", + 'da' => 'da', + 'da costa' => 'da costa', + 'da las' => 'da las', + 'da silva' => 'da silva', + 'dal' => 'dal', + 'dall' => 'dall', + "dall'" => "dall'", + 'dalla' => 'dalla', + 'dalle' => 'dalle', + 'dallo' => 'dallo', + 'das' => 'das', + 'de' => 'de', + 'degli' => 'degli', + 'dei' => 'dei', + 'den' => 'den', + "de l '" => "de l '", + 'de la' => 'de la', + 'de las' => 'de las', + 'de le' => 'de le', + 'de los' => 'de los', + 'del' => 'del', + 'del coz' => 'del coz', + 'deli' => 'deli', + 'dell' => 'dell', + "dell'" => "dell'", + 'della' => 'della', + 'delle' => 'delle', + 'delli' => 'delli', + 'dello' => 'dello', 'der' => 'der', + 'des' => 'des', + 'di' => 'di', + 'dit' => 'dit', + 'do' => 'do', + 'do ceu' => 'do ceu', + 'don' => 'don', + 'don le' => 'don le', + 'dos' => 'dos', + 'dos santos' => 'dos santos', + 'du' => 'du', + 'dy' => 'dy', + 'el' => 'el', + 'g' => 'g', + 'gen' => 'gen', + 'gil' => 'gil', + 'gli' => 'gli', + 'grosse' => 'grosse', + 'groãÿe' => 'große', + 'i' => 'i', + 'im' => 'im', + 'in' => 'in', + 'in 
de' => 'in de', + 'in den' => 'in den', + 'in der' => 'in der', + 'in het' => 'in het', + "in't" => "in't", + 'kl' => 'kl', + 'kleine' => 'kleine', + 'l' => 'l', + 'l.' => 'l.', + "l'" => "l'", + 'la' => 'la', + 'le' => 'le', + 'lee' => 'lee', + 'li' => 'li', + 'lo' => 'lo', + 'm' => 'm', + 'mc' => 'mc', + 'mac' => 'mac', + 'n' => 'n', + 'o' => 'o', + "o'" => "o'", + 'op' => 'op', + 'op de' => 'op de', + 'op den' => 'op den', + 'op gen' => 'op gen', + 'op het' => 'op het', + 'op te' => 'op te', + 'op ten' => 'op ten', + 'oude' => 'oude', + 'pla' => 'pla', + 'pro' => 'pro', + 's' => 's', + 'st.' => 'st.', + 't' => 't', + 'te' => 'te', + 'ten' => 'ten', + 'ter' => 'ter', + 'thi' => 'thi', + 'tho' => 'tho', + 'thom' => 'thom', + 'thor' => 'thor', + 'thum' => 'thum', + 'to' => 'to', + 'tom' => 'tom', + 'tor' => 'tor', + 'tu' => 'tu', + 'tum' => 'tum', + 'unten' => 'unten', + 'unter' => 'unter', + 'unterm' => 'unterm', + 'v.' => 'v.', + 'v. d.' => 'v. d.', + 'v. dem' => 'v. dem', + 'v. den' => 'v. den', + 'v. der' => 'v. der', + 'v.d.' 
=> 'v.d.', + 'v.dem' => 'v.dem', + 'v.den' => 'v.den', + 'v.der' => 'v.der', + 'van' => 'van', + 'van de' => 'van de', + 'van dem' => 'van dem', + 'van den' => 'van den', + 'van der' => 'van der', + 'vande' => 'vande', + 'vandem' => 'vandem', + 'vanden' => 'vanden', + 'vander' => 'vander', + 'van gen' => 'van gen', + 'van het' => 'van het', + 'van t' => 'van t', + 'ven' => 'ven', + 'ven der' => 'ven der', + 'ver' => 'ver', + 'vo' => 'vo', + 'vom' => 'vom', + 'vom und zu' => 'vom und zu', 'von' => 'von', + 'von und zu' => 'von und zu', + 'von und zu der' => 'von und zu der', + 'von und zur' => 'von und zur', + 'von de' => 'von de', + 'von dem' => 'von dem', + 'von den' => 'von den', + 'von der' => 'von der', + 'von la' => 'von la', + 'von zu' => 'von zu', + 'von zum' => 'von zum', + 'von zur' => 'von zur', + 'vonde' => 'vonde', + 'vonden' => 'vonden', + 'vondem' => 'vondem', + 'vonder' => 'vonder', + 'von einem' => 'von einem', + 'von mast' => 'von mast', + 'vor' => 'vor', + 'vor dem' => 'vor dem', + 'vor den' => 'vor den', + 'vor der' => 'vor der', + 'vorm' => 'vorm', + 'vorn' => 'vorn', + 'y' => 'y', + 'y del' => 'y del', + 'zu' => 'zu', + 'zum' => 'zum', + 'zur' => 'zur', + ]; + + /** + * the following list is according to HL7 (Health Level 7) standard + * @see https://www.vdek.com/vertragspartner/arbeitgeber/deuev/_jcr_content/par/download_8/file.res/Anlage_07_Vers.pdf + */ + const EXTENSIONS = [ // nobility predicate (Adelsprädikate) + 'bar' => 'Bar', + 'baron' => 'Baron', + 'baroness' => 'Baroness', + 'baronesse' => 'Baronesse', + 'baronin' => 'Baronin', + 'brand' => 'Brand', + 'burggraf' => 'Burggraf', + 'burggräfin' => 'Burggräfin', + 'condesa' => 'Condesa', + 'earl' => 'Earl', + 'edle' => 'Edle', + 'edler' => 'Edler', + 'erbgraf' => 'Erbgraf', + 'erbgräfin' => 'Erbgräfin', + 'erbprinz' => 'Erbprinz', + 'erbprinzessin' => 'Erbprinzessin', + 'ffr' => 'Ffr', + 'freifr' => 'Freifr', + 'freifräulein' => 'Freifräulein', + 'freifrau' => 'Freifrau', + 'freih' => 
'Freih', + 'freiherr' => 'Freiherr', + 'freiin' => 'Freiin', + 'frf' => 'Frf', + 'frf.' => 'Frf.', + 'frfr' => 'Frfr', + 'frfr.' => 'Frfr.', + 'frh' => 'Frh', + 'frh.' => 'Frh.', + 'frhr' => 'Frhr', + 'frhr.' => 'Frhr.', + 'fst' => 'Fst', + 'fst.' => 'Fst.', + 'fstn' => 'Fstn', + 'fstn.' => 'Fstn.', + 'fürst' => 'Fürst', + 'fürstin' => 'Fürstin', + 'gr' => 'Gr', + 'graf' => 'Graf', + 'gräfin' => 'Gräfin', + 'grf' => 'Grf', + 'grfn' => 'Grfn', + 'grossherzog' => 'Grossherzog', + 'großherzog' => 'Großherzog', + 'grossherzogin' => 'Grossherzogin', + 'großherzogin' => 'Großherzogin', + 'herzog' => 'Herzog', + 'herzogin' => 'Herzogin', + 'jhr' => 'Jhr', + 'jhr.' => 'Jhr.', + 'jonkheer' => 'Jonkheer', + 'junker' => 'Junker', + 'landgraf' => 'Landgraf', + 'landgräfin' => 'Landgräfin', + 'markgraf' => 'Markgraf', + 'markgräfin' => 'Markgräfin', + 'marques' => 'Marques', + 'marquis' => 'Marquis', + 'marschall' => 'Marschall', + 'ostoja' => 'Ostoja', + 'prinz' => 'Prinz', + 'prinzessin' => 'Prinzessin', + 'przin' => 'Przin', + 'rabe' => 'Rabe', + 'reichsgraf' => 'Reichsgraf', + 'reichsgräfin' => 'Reichsgräfin', + 'ritter' => 'Ritter', + 'rr' => 'Rr', + 'truchsess' => 'Truchsess', + 'truchseß' => 'Truchseß', + ]; + + /** + * the following list contains the academic titles for doctor degrees + * from DACH (Germany, Austria, Swiss) + * @see https://de.wikipedia.org/wiki/Doktor + */ + const TITLES_DR = [ + 'ddr.' => 'DDr.', + 'dr.' => 'Dr.', + 'dr. e. h.' => 'Dr. E. h.', + 'dr.e.h.' => 'Dr.E.h.', + 'dr. ph' => 'Dr. PH', + 'dr.ph' => 'Dr.PH', + 'dr. sportwiss.' => 'Dr. Sportwiss.', + 'dr.sportwiss.' => 'Dr.Sportwiss.', + 'dr. agr.' => 'Dr. agr.', + 'dr.agr.' => 'Dr.agr.', + 'dr. biol.' => 'Dr. biol.', + 'dr.biol.' => 'Dr.biol.', + 'dr. cult.' => 'Dr. cult.', + 'dr.cult.' => 'Dr.cult.', + 'dr. des.' => 'Dr. des.', + 'dr.des.' => 'Dr.des.', + 'dr. diac.' => 'Dr. diac.', + 'dr.diac.' => 'Dr.diac.', + 'dr. disc. pol.' => 'Dr. disc. pol.', + 'dr.disc.pol.' => 'Dr.disc.pol.', + 'dr. e. 
h.' => 'Dr. e. h.', + 'dr.e.h.' => 'Dr.e.h.', + 'dr. eh.' => 'Dr. eh.', + 'dr.eh.' => 'Dr.eh.', + 'dr. h. c.' => 'Dr. h. c.', + 'dr.h.c.' => 'Dr.h.c.', + 'dr. h. c. mult.' => 'Dr. h. c. mult.', + 'dr.h.c.mult.' => 'Dr.h.c.mult.', + 'dr. habil.' => 'Dr. habil.', + 'dr.habil.' => 'Dr.habil.', + 'dr. iur.' => 'Dr. iur.', + 'dr.iur.' => 'Dr.iur.', + 'dr. iur. can.' => 'Dr. iur. can.', + 'dr.iur.can.' => 'Dr.iur.can.', + 'dr. iur. et rer. pol.' => 'Dr. iur. et rer. pol.', + 'dr.iur.etrer.pol.' => 'Dr.iur.etrer.pol.', + 'dr. iur. utr.' => 'Dr. iur. utr.', + 'dr.iur.utr.' => 'Dr.iur.utr.', + 'dr. math.' => 'Dr. math.', + 'dr.math.' => 'Dr.math.', + 'dr. med.' => 'Dr. med.', + 'dr.med.' => 'Dr.med.', + 'dr. med. dent.' => 'Dr. med. dent.', + 'dr.med.dent.' => 'Dr.med.dent.', + 'dr. med. dent. sci.' => 'Dr. med. dent. sci.', + 'dr.med.dent.sci.' => 'Dr.med.dent.sci.', + 'dr. med. sci.' => 'Dr. med. sci.', + 'dr.med.sci.' => 'Dr.med.sci.', + 'dr. med. univ.' => 'Dr. med. univ.', + 'dr.med.univ.' => 'Dr.med.univ.', + 'dr. med. univ. et scient. med.' => 'Dr. med. univ. et scient. med.', + 'dr.med.univ.etscient.med.' => 'Dr.med.univ.etscient.med.', + 'dr. med. vet.' => 'Dr. med. vet.', + 'dr.med.vet.' => 'Dr.med.vet.', + 'dr. mont.' => 'Dr. mont.', + 'dr.mont.' => 'Dr.mont.', + 'dr. mult.' => 'Dr. mult.', + 'dr.mult.' => 'Dr.mult.', + 'dr. nat. med.' => 'Dr. nat. med.', + 'dr.nat.med.' => 'Dr.nat.med.', + 'dr. nat. oec.' => 'Dr. nat. oec.', + 'dr.nat.oec.' => 'Dr.nat.oec.', + 'dr. nat. techn.' => 'Dr. nat. techn.', + 'dr.nat.techn.' => 'Dr.nat.techn.', + 'dr. oec.' => 'Dr. oec.', + 'dr.oec.' => 'Dr.oec.', + 'dr. oec. hsg' => 'Dr. oec. HSG', + 'dr.oec.hsg' => 'Dr.oec.HSG', + 'dr. oec. publ.' => 'Dr. oec. publ.', + 'dr.oec.publ.' => 'Dr.oec.publ.', + 'dr. oec. troph.' => 'Dr. oec. troph.', + 'dr.oec.troph.' => 'Dr.oec.troph.', + 'dr. paed.' => 'Dr. paed.', + 'dr.paed.' => 'Dr.paed.', + 'dr. pharm.' => 'Dr. pharm.', + 'dr.pharm.' => 'Dr.pharm.', + 'dr. phil.' => 'Dr. 
phil.', + 'dr.phil.' => 'Dr.phil.', + 'dr. phil. fac. theol.' => 'Dr. phil. fac. theol.', + 'dr.phil.fac.theol.' => 'Dr.phil.fac.theol.', + 'dr. phil. in art.' => 'Dr. phil. in art.', + 'dr.phil.inart.' => 'Dr.phil.inart.', + 'dr. phil. nat.' => 'Dr. phil. nat.', + 'dr.phil.nat.' => 'Dr.phil.nat.', + 'dr. rer. agr.' => 'Dr. rer. agr.', + 'dr.rer.agr.' => 'Dr.rer.agr.', + 'dr. rer. biol. hum.' => 'Dr. rer. biol. hum.', + 'dr.rer.biol.hum.' => 'Dr.rer.biol.hum.', + 'dr. rer. biol. vet.' => 'Dr. rer. biol. vet.', + 'dr.rer.biol.vet.' => 'Dr.rer.biol.vet.', + 'dr. rer. cam.' => 'Dr. rer. cam.', + 'dr.rer.cam.' => 'Dr.rer.cam.', + 'dr. rer. comm.' => 'Dr. rer. comm.', + 'dr.rer.comm.' => 'Dr.rer.comm.', + 'dr. rer. cult.' => 'Dr. rer. cult.', + 'dr.rer.cult.' => 'Dr.rer.cult.', + 'dr. rer. cur.' => 'Dr. rer. cur.', + 'dr.rer.cur.' => 'Dr.rer.cur.', + 'dr. rer. forest.' => 'Dr. rer. forest.', + 'dr.rer.forest.' => 'Dr.rer.forest.', + 'dr. rer. hort.' => 'Dr. rer. hort.', + 'dr.rer.hort.' => 'Dr.rer.hort.', + 'dr. rer. hum.' => 'Dr. rer. hum.', + 'dr.rer.hum.' => 'Dr.rer.hum.', + 'dr. rer. med.' => 'Dr. rer. med.', + 'dr.rer.med.' => 'Dr.rer.med.', + 'dr. rer. medic.' => 'Dr. rer. medic.', + 'dr.rer.medic.' => 'Dr.rer.medic.', + 'dr. rer. merc.' => 'Dr. rer. merc.', + 'dr.rer.merc.' => 'Dr.rer.merc.', + 'dr. rer. mil.' => 'Dr. rer. mil.', + 'dr.rer.mil.' => 'Dr.rer.mil.', + 'dr. rer. mont.' => 'Dr. rer. mont.', + 'dr.rer.mont.' => 'Dr.rer.mont.', + 'dr. rer. nat.' => 'Dr. rer. nat.', + 'dr.rer.nat.' => 'Dr.rer.nat.', + 'dr. rer. oec.' => 'Dr. rer. oec.', + 'dr.rer.oec.' => 'Dr.rer.oec.', + 'dr. rer. physiol.' => 'Dr. rer. physiol.', + 'dr.rer.physiol.' => 'Dr.rer.physiol.', + 'dr. rer. pol.' => 'Dr. rer. pol.', + 'dr.rer.pol.' => 'Dr.rer.pol.', + 'dr. rer. publ.' => 'Dr. rer. publ.', + 'dr.rer.publ.' => 'Dr.rer.publ.', + 'dr. rer. publ. hsg' => 'Dr. rer. publ. HSG', + 'dr.rer.publ.hsg' => 'Dr.rer.publ.HSG', + 'dr. rer. rel.' => 'Dr. rer. rel.', + 'dr.rer.rel.' 
=> 'Dr.rer.rel.', + 'dr. rer. sec.' => 'Dr. rer. sec.', + 'dr.rer.sec.' => 'Dr.rer.sec.', + 'dr. rer. silv.' => 'Dr. rer. silv.', + 'dr.rer.silv.' => 'Dr.rer.silv.', + 'dr. rer. soc.' => 'Dr. rer. soc.', + 'dr.rer.soc.' => 'Dr.rer.soc.', + 'dr. rer. soc. oec.' => 'Dr. rer. soc. oec.', + 'dr.rer.soc.oec.' => 'Dr.rer.soc.oec.', + 'dr. rer. tech.' => 'Dr. rer. tech.', + 'dr.rer.tech.' => 'Dr.rer.tech.', + 'dr. sc.' => 'Dr. sc.', + 'dr.sc.' => 'Dr.sc.', + 'dr. sc. eth zürich' => 'Dr. sc. ETH Zürich', + 'dr.sc.eth zürich' => 'Dr.sc.ETH Zürich', + 'dr. sc. agr.' => 'Dr. sc. agr.', + 'dr.sc.agr.' => 'Dr.sc.agr.', + 'dr. sc. hum.' => 'Dr. sc. hum.', + 'dr.sc.hum.' => 'Dr.sc.hum.', + 'dr. sc. inf.' => 'Dr. sc. inf.', + 'dr.sc.inf.' => 'Dr.sc.inf.', + 'dr. sc. inf. biomed.' => 'Dr. sc. inf. biomed.', + 'dr.sc.inf.biomed.' => 'Dr.sc.inf.biomed.', + 'dr. sc. inf. med.' => 'Dr. sc. inf. med.', + 'dr.sc.inf.med.' => 'Dr.sc.inf.med.', + 'dr. sc. math.' => 'Dr. sc. math.', + 'dr.sc.math.' => 'Dr.sc.math.', + 'dr. sc. med.' => 'Dr. sc. med.', + 'dr.sc.med.' => 'Dr.sc.med.', + 'dr. sc. mus.' => 'Dr. sc. mus.', + 'dr.sc.mus.' => 'Dr.sc.mus.', + 'dr. sc. nat.' => 'Dr. sc. nat.', + 'dr.sc.nat.' => 'Dr.sc.nat.', + 'dr. sc. oec.' => 'Dr. sc. oec.', + 'dr.sc.oec.' => 'Dr.sc.oec.', + 'dr. sc. pol.' => 'Dr. sc. pol.', + 'dr.sc.pol.' => 'Dr.sc.pol.', + 'dr. sc. rel.' => 'Dr. sc. rel.', + 'dr.sc.rel.' => 'Dr.sc.rel.', + 'dr. sc. soc.' => 'Dr. sc. soc.', + 'dr.sc.soc.' => 'Dr.sc.soc.', + 'dr. sc. techn.' => 'Dr. sc. techn.', + 'dr.sc.techn.' => 'Dr.sc.techn.', + 'dr. scient. med' => 'Dr. scient. med', + 'dr.scient.med' => 'Dr.scient.med', + 'dr. techn.' => 'Dr. techn.', + 'dr.techn.' => 'Dr.techn.', + 'dr. theol.' => 'Dr. theol.', + 'dr.theol.' => 'Dr.theol.', + 'dr. troph.' => 'Dr. troph.', + 'dr.troph.' => 'Dr.troph.', + 'dr.-ing.' => 'Dr.-Ing.', + 'ph. d.' => 'Ph. D.', + 'ph.d.' 
=> 'Ph.D.', + ]; + + /** + * the following list contains the academic titles mainly for professors + * from DACH (Germany, Austria, Switzerland), which are often used in names as titles + * @see https://de.wikipedia.org/wiki/Professor + * this list is kept separate from TITLES_DR to better maintain both + */ + const OFFICIAL_TITLES = [ + 'ao. univ.-prof.' => 'ao. Univ.-Prof.', + 'ao.univ.-prof.' => 'ao.Univ.-Prof.', + 'apl. prof.' => 'apl. Prof.', + 'apl.prof.' => 'apl.Prof.', + 'ass.-prof.' => 'Ass.-Prof.', + 'assoz. prof.' => 'assoz. Prof.', + 'assoz.prof.' => 'assoz.Prof.', + 'hon.-prof.' => 'Hon.-Prof.', + 'jun.-prof.' => 'Jun.-Prof.', + 'o. univ.-prof.' => 'o. Univ.-Prof.', + 'o.univ.-prof.' => 'o.Univ.-Prof.', + 'o.ö. prof.' => 'o.ö. Prof.', + 'o.ö.prof.' => 'o.ö.Prof.', + 'pd' => 'PD', + 'priv.-doz.' => 'Priv.-Doz.', + 'prof. em.' => 'Prof. em.', + 'prof. emer.' => 'Prof. emer.', + 'prof. h. c.' => 'Prof. h. c.', + 'prof. hon.' => 'Prof. hon.', + 'prof. i. k.' => 'Prof. i. K.', + 'prof.' => 'Prof.', + 'prof.em.' => 'Prof.em.', + 'prof.emer.' => 'Prof.emer.', + 'prof.h.c.' => 'Prof.h.c.', + 'prof.hon.' => 'Prof.hon.', + 'prof.i.k.' => 'Prof.i.K.', + 'professor' => 'Professor', + 'professorin' => 'Professorin', + 'tit. prof.' => 'Tit. Prof.', + 'tit.prof.' => 'Tit.Prof.', + 'univ.-prof.' => 'Univ.-Prof.', + ]; + + const JOB_TITLES = [ + 'dipl.-ing.' => 'Dipl.-Ing.', + 'ra' => 'RA', + ]; + + const COMPANIES = [ + ' - ' => ' - ', + '& co.' => '& Co.', + '&' => '&', + '+' => '+', + 'ag & co. kg' => 'AG & Co. KG', + 'ag & co. kgaa' => 'AG & Co. KGaA', + 'ag' => 'AG', + 'agentur' => 'Agentur', + 'aktien' => 'Aktien', + 'apotheke' => 'Apotheke', + 'bank' => 'Bank', + 'bkk' => 'BKK', + 'büro' => 'büro', + 'centrum' => 'Centrum', + 'dienst' => 'Dienst', + 'direct' => 'Direct', + 'direkt' => 'Direkt', + 'e. g.' => 'e. G.', + 'e. k.' => 'e. K.', + 'e. kfm' => 'e. Kfm', + 'e. kfr' => 'e. Kfr', + 'e.g.' => 'e.G.', + 'e.k.' => 'e.K.', + 'e.v.' 
=> 'e.V.', + 'eg' => 'eG', + 'eingetragene kauffrau' => 'eingetragene Kauffrau', + 'eingetragener kaufmann' => 'eingetragener Kaufmann', + 'elektro' => 'Elektro', + 'finanz' => 'Finanz', + 'friseur' => 'Friseur', + 'frisier' => 'Frisier', + 'gbr' => 'GbR', + 'gen.' => 'Gen.', + 'genossenschaft' => 'Genossenschaft', + 'ges. m. b. h.' => 'Ges. m. b. H.', + 'ges.' => 'Ges.', + 'ges.m.b.h.' => 'Ges.m.b.H.', + 'gesellschaft' => 'Gesellschaft', + 'ggmbh' => 'gGmbH', + 'gmbh & co. kg' => 'GmbH & Co. KG', + 'gmbh & co. kgaa' => 'GmbH & Co. KGaA', + 'gmbh & co. ohg' => 'GmbH & Co. OHG', + 'gmbh' => 'GmbH', + 'gymnasium' => 'Gymnasium', + 'hotel' => 'Hotel', + 'i. g.' => 'i. G.', + 'i.g.' => 'i.G.', + 'kg' => 'KG', + 'kgaa' => 'KGaA', + 'krankenhaus' => 'Krankenhaus', + 'krankenkasse' => 'Krankenkasse', + 'llc & co. kg' => 'LLC & Co. KG', + 'mbh' => 'mbH', + 'ohg' => 'OHG', + 'polizei' => 'Polizei', + 'praxis' => 'Praxis', + 'restaurant' => 'Restaurant', + 'salon' => 'salon', + 'schule' => 'Schule', + ' se' => ' SE', + 'service' => 'Service', + 'steuer' => 'Steuer', + 'stiftung' => 'Stiftung', + 'technik' => 'Technik', + 'theater' => 'Theater', + 'ug (haftungsbeschränkt)' => 'UG (haftungsbeschränkt)', + 'und co. kg' => 'und Co. KG', + 'und co.' 
=> 'und Co.', + 'universität' => 'Universität', + 'unternehmergesellschaft (haftungsbeschränkt)' => 'Unternehmergesellschaft (haftungsbeschränkt)', + 'verband' => 'Verband', + 'verein' => 'Verein', + 'versicherung' => 'Versicherung', + 'verwaltung' => 'Verwaltung', + 'vvag' => 'VVaG', + 'zentrum' => 'Zentrum', ]; public function getSuffixes(): array @@ -45,4 +655,19 @@ public function getLastnamePrefixes(): array { return self::LASTNAME_PREFIXES; } + + public function getExtensions(): array + { + return self::EXTENSIONS; + } + + public function getTitles(): array + { + return array_merge(self::TITLES_DR, self::OFFICIAL_TITLES, self::JOB_TITLES); + } + + public function getCompanies(): array + { + return self::COMPANIES; + } } diff --git a/src/LanguageInterface.php b/src/LanguageInterface.php index d4d8bd8..3002884 100644 --- a/src/LanguageInterface.php +++ b/src/LanguageInterface.php @@ -9,4 +9,10 @@ public function getSuffixes(): array; public function getLastnamePrefixes(): array; public function getSalutations(): array; + + public function getExtensions(): array; + + public function getTitles(): array; + + public function getCompanies(): array; } diff --git a/src/Mapper/AbstractMapper.php b/src/Mapper/AbstractMapper.php index 68811ca..c08a827 100644 --- a/src/Mapper/AbstractMapper.php +++ b/src/Mapper/AbstractMapper.php @@ -19,7 +19,7 @@ abstract public function map(array $parts); * checks if there are still unmapped parts left before the given position * * @param array $parts - * @param $index + * @param int $index * @return bool */ protected function hasUnmappedPartsBefore(array $parts, $index): bool @@ -65,4 +65,21 @@ protected function getKey($word): string { return strtolower(str_replace('.', '', $word)); } + + /** + * sort array by length descending and alphabetically ascending + * https://stackoverflow.com/questions/44835807/sort-array-by-length-and-then-alphabetically + * + * @param array $values + * @return array + */ + protected function 
sortArrayDescending(array $values): array + { + uasort($values, function($a, $b) { + return strlen($b) <=> strlen($a) ?: strcmp($a, $b);; + }); + + return $values; + } } + diff --git a/src/Mapper/CompanyMapper.php b/src/Mapper/CompanyMapper.php new file mode 100644 index 0000000..ef8c14d --- /dev/null +++ b/src/Mapper/CompanyMapper.php @@ -0,0 +1,40 @@ +companies = $this->sortArrayDescending($companies); + } + + /** + * map companies in the full name + * + * @param array $parts = the fullname + * @return array + */ + public function map(array $parts): array + { + if (count($parts) <> 1) { + return []; + } + + foreach ($this->companies as $company) { + if (strpos($parts[0], $company) !== false) { + return [new Company($parts[0])]; + } + } + + return []; + } +} diff --git a/src/Mapper/ExtensionMapper.php b/src/Mapper/ExtensionMapper.php new file mode 100644 index 0000000..10c7412 --- /dev/null +++ b/src/Mapper/ExtensionMapper.php @@ -0,0 +1,45 @@ +extensions = $this->sortArrayDescending($extensions); + } + + /** + * map extensions in the parts array + * + * @param array $parts the name parts + * @return array the mapped parts + */ + public function map(array $parts): array + { + foreach ($parts as $key => $part) { + if ($part instanceof AbstractPart) { + continue; + } + if (in_array($part, $this->extensions)) { + $parts[$key] = new Extension($parts[$key]); + } + } + + return $parts; + } +} \ No newline at end of file diff --git a/src/Mapper/FirstnameMapper.php b/src/Mapper/FirstnameMapper.php index 79226b4..68fbd89 100644 --- a/src/Mapper/FirstnameMapper.php +++ b/src/Mapper/FirstnameMapper.php @@ -32,8 +32,8 @@ public function map(array $parts): array } /** - * @param $part - * @return Firstname + * @param string|AbstractPart $part + * @return AbstractPart */ protected function handleSinglePart($part): AbstractPart { diff --git a/src/Mapper/LastnameMapper.php b/src/Mapper/LastnameMapper.php index 43cfdc7..6c28112 100644 --- a/src/Mapper/LastnameMapper.php +++ 
b/src/Mapper/LastnameMapper.php @@ -166,7 +166,7 @@ protected function shouldStopMapping(array $parts, int $k): bool /** * indicates if the given part should be ignored (skipped) during mapping * - * @param $part + * @param string|AbstractPart $part * @return bool */ protected function isIgnoredPart($part) { @@ -248,8 +248,8 @@ protected function isPrefix($word): bool /** * find the next non-nickname index in parts * - * @param $parts - * @param $startIndex + * @param array $parts + * @param int $startIndex * @return int|void */ protected function skipNicknameParts($parts, $startIndex) diff --git a/src/Mapper/MiddlenameMapper.php b/src/Mapper/MiddlenameMapper.php index be129b7..ff9dd2e 100644 --- a/src/Mapper/MiddlenameMapper.php +++ b/src/Mapper/MiddlenameMapper.php @@ -41,9 +41,9 @@ public function map(array $parts): array } /** - * @param $start - * @param $parts - * @return mixed + * @param int $start + * @param array $parts + * @return array */ protected function mapFrom($start, $parts): array { diff --git a/src/Mapper/MultipartMapper.php b/src/Mapper/MultipartMapper.php new file mode 100644 index 0000000..e48ea2c --- /dev/null +++ b/src/Mapper/MultipartMapper.php @@ -0,0 +1,103 @@ +sampleType = $sampleType; + + if (strtolower($sampleType) == 'prefix') { + $this->className = 'TheIconic\\NameParser\\Part\\Lastname' . ucfirst($sampleType); + } else { + $this->className = 'TheIconic\\NameParser\\Part\\' . 
ucfirst($sampleType); + } + + $values = []; + + $samples = $this->sortArrayDescending($samples); + foreach ($samples as $key => $sample) { // fragmentation of the strings into words or abbreviations + $fragments = explode(' ', $sample); + $values[$key][$sampleType] = $sample; + $values[$key]['fragments'] = $fragments; + } + $this->samples = $values; + } + + /** + * map composite name components in the parts array + * + * @param array $parts the name parts + * @return array the mapped parts + */ + public function map(array $parts): array + { + foreach ($this->samples as $sample) { + $mappedParts = []; + foreach ($sample['fragments'] as $fragment) { + $result = array_search($fragment, $parts); + if ($result !== false) { + $mappedParts[] = $result; + } else { + continue(2); + } + } + if (count($result = $this->mapParts($mappedParts, $parts))) { + $parts = $result; // all sample fragments successful mapped to parts + break; + } + } + + return $parts; + } + + /** + * map sample fragments to parts + * + * @param array $mappedParts + * @param array $parts + * @return array + */ + public function mapParts(array $mappedParts, array $parts): array + { + $className = $this->className; + foreach ($mappedParts as $key) { + if ($parts[$key] instanceof AbstractPart) { + return []; + } + $parts[$key] = new $className($parts[$key]); + } + + return $parts; + } + + /** + * get sampleType + * + * @return string + */ + public function getSampleType() + { + return $this->sampleType; + } +} diff --git a/src/Mapper/SuffixMapper.php b/src/Mapper/SuffixMapper.php index 49206cc..6a492cd 100644 --- a/src/Mapper/SuffixMapper.php +++ b/src/Mapper/SuffixMapper.php @@ -49,7 +49,7 @@ public function map(array $parts): array } /** - * @param $parts + * @param array $parts * @return bool */ protected function isMatchingSinglePart($parts): bool @@ -66,7 +66,7 @@ protected function isMatchingSinglePart($parts): bool } /** - * @param $part + * @param string|AbstractPart $part * @return bool */ 
protected function isSuffix($part): bool diff --git a/src/Name.php b/src/Name.php index 15d2679..1d7ac06 100644 --- a/src/Name.php +++ b/src/Name.php @@ -66,12 +66,15 @@ public function getAll(bool $format = false): array $results = []; $keys = [ 'salutation' => [], + 'title' => [], 'firstname' => [], 'nickname' => [$format], 'middlename' => [], 'initials' => [], + 'extension' => [], 'lastname' => [], 'suffix' => [], + 'company' => [], ]; foreach ($keys as $key => $args) { @@ -191,6 +194,71 @@ public function getMiddlename(): string return $this->export('Middlename'); } + /** + * get the company + * + * @return string + */ + public function getCompany(): string + { + return $this->export('Company'); + } + + /** + * get the extension + * + * @return string + */ + public function getExtension(): string + { + return $this->export('Extension'); + } + + /** + * get the title(s) + * + * @return string + */ + public function getTitle(): string + { + return $this->export('Title'); + } + + /** + * get an array with well-formatted names and their separators, + * where the keys represent vCard properties + * @see https://tools.ietf.org/html/rfc6350#section-6.2.2 + * + * @return array + */ + public function getVCardArray(): array + { + return [ + 'FN' => implode(' ', array_diff_key($this->getAll(), [ + 'nickname' => $this->getNickname(), // fullname with stripped off nickname + ])), + 'N' => implode(';', array_filter([ // RFC6350: five segments in sequence: + $this->getLastname(true), // 1. Family Names (also known as surnames) + $this->getFirstname(), // 2. Given Names + implode(',', array_filter([ // 3. Additional Names + str_replace(' ', ',', $this->getMiddlename()), + $this->getInitials(), + ])), + implode(',', array_filter([ // 4. Honorific Prefixes + $this->getSalutation(), + $this->getTitle(), + ])), + implode(',', array_filter([ // 5. 
Honorific Suffixes + $this->getExtension(), + $this->getLastnamePrefix(), + $this->getSuffix(), + ])), + ])), + 'NICKNAME' => $this->getNickname(), + 'ORG' => $this->getCompany(), + ]; + } + /** * helper method used by getters to extract and format relevant name parts * diff --git a/src/Parser.php b/src/Parser.php index f2040b0..eb65b1a 100644 --- a/src/Parser.php +++ b/src/Parser.php @@ -10,6 +10,9 @@ use TheIconic\NameParser\Mapper\LastnameMapper; use TheIconic\NameParser\Mapper\FirstnameMapper; use TheIconic\NameParser\Mapper\MiddlenameMapper; +use TheIconic\NameParser\Mapper\CompanyMapper; +use TheIconic\NameParser\Mapper\ExtensionMapper; +use TheIconic\NameParser\Mapper\MultipartMapper; class Parser { @@ -59,6 +62,9 @@ public function __construct(array $languages = []) * - middle initials * - surname / last name * - suffix (II, Phd, Jr, etc) + * - extension (Germany: nobility predicate is part of lastname) + * - title (Germany: academic titles are usually used as name parts between salutation and given name) + * - company (the string contains typical characteristics for a company name and is returned identically) * * @param string $name * @return Name @@ -71,6 +77,11 @@ public function parse($name): Name if (1 < count($segments)) { return $this->parseSplitName($segments[0], $segments[1], $segments[2] ?? 
''); + } else { + $mapped = $this->getCompany($name); + if (count($mapped)) { + return new Name($mapped); + } } $parts = explode(' ', $name); @@ -85,9 +96,9 @@ public function parse($name): Name /** * handles split-parsing of comma-separated name parts * - * @param $left - the name part left of the comma - * @param $right - the name part right of the comma - * + * @param string $first - the name part left of the comma + * @param string $second - the name part right of the comma + * @param string $third * @return Name */ protected function parseSplitName($first, $second, $third): Name @@ -109,9 +120,12 @@ protected function getFirstSegmentParser(): Parser $parser = new Parser(); $parser->setMappers([ - new SalutationMapper($this->getSalutations(), $this->getMaxSalutationIndex()), - new SuffixMapper($this->getSuffixes(), false, 2), - new LastnameMapper($this->getPrefixes(), true), + new ExtensionMapper($this->getSamples('Extensions')), + new MultipartMapper($this->getSamples('Titles'), 'title'), + new MultipartMapper($this->getSamples('LastnamePrefixes'), 'prefix'), + new SalutationMapper($this->getSamples('Salutations'), $this->getMaxSalutationIndex()), + new SuffixMapper($this->getSamples('Suffixes'), false, 2), + new LastnameMapper($this->getSamples('LastnamePrefixes'), true), new FirstnameMapper(), new MiddlenameMapper(), ]); @@ -127,8 +141,11 @@ protected function getSecondSegmentParser(): Parser $parser = new Parser(); $parser->setMappers([ - new SalutationMapper($this->getSalutations(), $this->getMaxSalutationIndex()), - new SuffixMapper($this->getSuffixes(), true, 1), + new ExtensionMapper($this->getSamples('Extensions')), + new MultipartMapper($this->getSamples('Titles'), 'title'), + new MultipartMapper($this->getSamples('LastnamePrefixes'), 'prefix'), + new SalutationMapper($this->getSamples('Salutations'), $this->getMaxSalutationIndex()), + new SuffixMapper($this->getSamples('Suffixes'), true, 1), new NicknameMapper($this->getNicknameDelimiters()), new 
InitialMapper($this->getMaxCombinedInitials(), true), new FirstnameMapper(), @@ -143,7 +160,7 @@ protected function getThirdSegmentParser(): Parser $parser = new Parser(); $parser->setMappers([ - new SuffixMapper($this->getSuffixes(), true, 0), + new SuffixMapper($this->getSamples('Suffixes'), true, 0), ]); return $parser; @@ -158,11 +175,14 @@ public function getMappers(): array { if (empty($this->mappers)) { $this->setMappers([ + new ExtensionMapper($this->getSamples('Extensions')), + new MultipartMapper($this->getSamples('Titles'), 'title'), + new MultipartMapper($this->getSamples('LastnamePrefixes'), 'prefix'), new NicknameMapper($this->getNicknameDelimiters()), - new SalutationMapper($this->getSalutations(), $this->getMaxSalutationIndex()), - new SuffixMapper($this->getSuffixes()), + new SalutationMapper($this->getSamples('Salutations'), $this->getMaxSalutationIndex()), + new SuffixMapper($this->getSamples('Suffixes')), new InitialMapper($this->getMaxCombinedInitials()), - new LastnameMapper($this->getPrefixes()), + new LastnameMapper($this->getSamples('LastnamePrefixes')), new FirstnameMapper(), new MiddlenameMapper(), ]); @@ -171,6 +191,19 @@ public function getMappers(): array return $this->mappers; } + /** + * get the name as company if it matches company identifiers + * + * @param string $name + * @return array + */ + protected function getCompany(string $name): array + { + $mapper = new CompanyMapper($this->getSamples('Companies')); + + return $mapper->map([$name]); + } + /** * set the mappers for this parser * @@ -212,7 +245,7 @@ public function getWhitespace(): string /** * set the string of characters that are supposed to be treated as whitespace * - * @param $whitespace + * @param string $whitespace * @return Parser */ public function setWhitespace($whitespace): Parser @@ -225,46 +258,15 @@ public function setWhitespace($whitespace): Parser /** * @return array */ - protected function getPrefixes() + protected function getSamples(string $sampleName): 
array { - $prefixes = []; - - /** @var LanguageInterface $language */ - foreach ($this->languages as $language) { - $prefixes += $language->getLastnamePrefixes(); - } - - return $prefixes; - } - - /** - * @return array - */ - protected function getSuffixes() - { - $suffixes = []; - - /** @var LanguageInterface $language */ - foreach ($this->languages as $language) { - $suffixes += $language->getSuffixes(); - } - - return $suffixes; - } - - /** - * @return array - */ - protected function getSalutations() - { - $salutations = []; - - /** @var LanguageInterface $language */ + $samples = []; + $method = sprintf('get%s', $sampleName); foreach ($this->languages as $language) { - $salutations += $language->getSalutations(); + $samples += call_user_func_array([$language, $method], []); } - return $salutations; + return $samples; } /** diff --git a/src/Part/AbstractPart.php b/src/Part/AbstractPart.php index 51e6372..1b396f0 100644 --- a/src/Part/AbstractPart.php +++ b/src/Part/AbstractPart.php @@ -12,7 +12,7 @@ abstract class AbstractPart /** * constructor allows passing the value to wrap * - * @param $value + * @param string|AbstractPart $value */ public function __construct($value) { @@ -61,8 +61,8 @@ public function normalize(): string * helper for camelization of values * to be used during normalize * - * @param $word - * @return mixed + * @param string $word + * @return string */ protected function camelcase($word): string { @@ -76,7 +76,7 @@ protected function camelcase($word): string /** * camelcasing callback * - * @param $matches + * @param array $matches * @return string */ protected function camelcaseReplace($matches): string @@ -84,7 +84,7 @@ protected function camelcaseReplace($matches): string if (function_exists('mb_convert_case')) { return mb_convert_case($matches[0], MB_CASE_TITLE, 'UTF-8'); } - + return ucfirst(strtolower($matches[0])); } } diff --git a/src/Part/Company.php b/src/Part/Company.php new file mode 100644 index 0000000..1b9524c --- /dev/null 
+++ b/src/Part/Company.php @@ -0,0 +1,7 @@ +