Module:Citation/CS1/Identifiers/sandbox: Difference between revisions
| Richardpruen (talk | contribs) m 1 revision imported | No edit summary | ||
| Line 1: | Line 1: | ||
| --[[ | --[[ | ||
| History of changes since last sync:  | History of changes since last sync: 2022-01-22 | ||
| 2022-09-23: catch 1 & 2 digit doi registrants with subcode; see Help_talk:Citation_Style_1#Fails_to_throw_a_DOI_error | |||
| ]] | ]] | ||
| Line 189: | Line 187: | ||
| This function does not work if it is fed month names for languages other than English.  Wikimedia #time: parser | This function does not work if it is fed month names for languages other than English.  Wikimedia #time: parser | ||
| apparently doesn't understand non-English date month names. This function will always return false when the date | apparently doesn't understand non-English date month names. This function will always return false when the date | ||
| contains a non-English month name because good1 is false after the call to  | contains a non-English month name because good1 is false after the call to lang_object.formatDate().  To get | ||
| call this function with YYYY-MM-DD format  | around that call this function with date parts and create a YYYY-MM-DD format date. | ||
| ]=] | ]=] | ||
| local function is_valid_biorxiv_date (biorxiv_date) | local function is_valid_biorxiv_date (y, m, d) | ||
| 	local biorxiv_date = table.concat ({y, m, d}, '-');							-- make ymd date | |||
| 	local good1, good2; | 	local good1, good2; | ||
| 	local biorxiv_ts, tomorrow_ts;												-- to hold Unix timestamps representing the dates | 	local biorxiv_ts, tomorrow_ts;												-- to hold Unix timestamps representing the dates | ||
| Line 259: | Line 258: | ||
| --[[--------------------------< N O R M A L I Z E _ L C C N >-------------------------------------------------- | --[[--------------------------< N O R M A L I Z E _ L C C N >-------------------------------------------------- | ||
| LCCN normalization ( | LCCN normalization (https://www.loc.gov/marc/lccn-namespace.html#normalization) | ||
| 1. Remove all blanks. | 1. Remove all blanks. | ||
| 2. If there is a forward slash (/) in the string, remove it, and remove all characters to the right of the forward slash. | 2. If there is a forward slash (/) in the string, remove it, and remove all characters to the right of the forward slash. | ||
| Line 269: | Line 268: | ||
| Returns a normalized LCCN for lccn() to validate.  There is no error checking (step 3.b.1) performed in this function. | Returns a normalized LCCN for lccn() to validate.  There is no error checking (step 3.b.1) performed in this function. | ||
| ]] | ]] | ||
| Line 295: | Line 295: | ||
| --[[--------------------------< A R X I V >-------------------------------------------------------------------- | --[[--------------------------< A R X I V >-------------------------------------------------------------------- | ||
| See:  | See: https://arxiv.org/help/arxiv_identifier | ||
| format and error check arXiv identifier.  There are three valid forms of the identifier: | format and error check arXiv identifier.  There are three valid forms of the identifier: | ||
| Line 320: | Line 320: | ||
| 	<date code> and <version> are as defined for 0704-1412 | 	<date code> and <version> are as defined for 0704-1412 | ||
| 	<number> is a five-digit number | 	<number> is a five-digit number | ||
| ]] | ]] | ||
| Line 388: | Line 389: | ||
| Validates (sort of) and formats a bibcode ID. | Validates (sort of) and formats a bibcode ID. | ||
| Format for bibcodes is specified here:  | Format for bibcodes is specified here: https://adsabs.harvard.edu/abs_doc/help_pages/data.html#bibcodes | ||
| But, this: 2015arXiv151206696F is apparently valid so apparently, the only things that really matter are length, 19 characters | But, this: 2015arXiv151206696F is apparently valid so apparently, the only things that really matter are length, 19 characters | ||
| Line 473: | Line 474: | ||
| 			if m then															-- m is nil when id is the six-digit form | 			if m then															-- m is nil when id is the six-digit form | ||
| 				if not is_valid_biorxiv_date (y  | 				if not is_valid_biorxiv_date (y, m, d) then						-- validate the encoded date; TODO: don't ignore leap-year and actual month lengths ({{#time:}} is a poor date validator) | ||
| 					break;														-- date fail; break out early so we don't unset the error message | 					break;														-- date fail; break out early so we don't unset the error message | ||
| 				end | 				end | ||
| Line 480: | Line 481: | ||
| 			break;																-- and done | 			break;																-- and done | ||
| 		end | 		end | ||
| 	end	 | 	end																			-- err_msg remains set here when no match | ||
| 	if err_msg then | 	if err_msg then | ||
| Line 498: | Line 499: | ||
| The description of the structure of this identifier can be found at Help_talk:Citation_Style_1/Archive_26#CiteSeerX_id_structure | The description of the structure of this identifier can be found at Help_talk:Citation_Style_1/Archive_26#CiteSeerX_id_structure | ||
| ]] | ]] | ||
| Line 533: | Line 535: | ||
| and terminal punctuation may not be technically correct but it appears that, in practice, these characters are rarely | and terminal punctuation may not be technically correct but it appears that, in practice, these characters are rarely | ||
| if ever used in DOI names. | if ever used in DOI names. | ||
| https://www.doi.org/doi_handbook/2_Numbering.html				-- 2.2 Syntax of a DOI name | |||
| https://www.doi.org/doi_handbook/2_Numbering.html#2.2.2			-- 2.2.2 DOI prefix | |||
| ]] | ]] | ||
| Line 579: | Line 584: | ||
| 		'^[^1-9]%d%d%d$',														-- 4 digits without subcode (0xxx); accepts: 1000–9999 | 		'^[^1-9]%d%d%d$',														-- 4 digits without subcode (0xxx); accepts: 1000–9999 | ||
| 		'^%d%d%d%d%d%d+',														-- 6 or more digits | 		'^%d%d%d%d%d%d+',														-- 6 or more digits | ||
| 		'^%d%d?%d?$',															-- less than 4 digits without subcode (with subcode is legitimate) | 		'^%d%d?%d?$',															-- less than 4 digits without subcode (3 digits with subcode is legitimate) | ||
| 		'^%d%d?%.[%d%.]+',														-- 1 or 2 digits with subcode | |||
| 		'^5555$',																-- test registrant will never resolve | 		'^5555$',																-- test registrant will never resolve | ||
| 		'[^%d%.]',																-- any character that isn't a digit or a dot | 		'[^%d%.]',																-- any character that isn't a digit or a dot | ||
| Line 627: | Line 633: | ||
| if ever used in HDLs. | if ever used in HDLs. | ||
| Query string parameters are named here:  | Query string parameters are named here: https://www.handle.net/proxy_servlet.html.  Query strings are not displayed | ||
| but since '?' is an allowed character in an HDL, '?' followed by one of the query parameters is the only way we | but since '?' is an allowed character in an HDL, '?' followed by one of the query parameters is the only way we | ||
| have to detect the query string so that it isn't URL-encoded with the rest of the identifier. | have to detect the query string so that it isn't URL-encoded with the rest of the identifier. | ||
| Line 637: | Line 643: | ||
| 	local access = options.access; | 	local access = options.access; | ||
| 	local handler = options.handler; | 	local handler = options.handler; | ||
| 	local query_params = {														-- list of known query parameters from  | 	local query_params = {														-- list of known query parameters from https://www.handle.net/proxy_servlet.html | ||
| 		'noredirect', | 		'noredirect', | ||
| 		'ignore_aliases', | 		'ignore_aliases', | ||
| Line 806: | Line 812: | ||
| Determines whether an ISMN string is valid.  Similar to ISBN-13, ISMN is 13 digits beginning 979-0-... and uses the | Determines whether an ISMN string is valid.  Similar to ISBN-13, ISMN is 13 digits beginning 979-0-... and uses the | ||
| same check digit calculations.  See  | same check digit calculations.  See https://www.ismn-international.org/download/Web_ISMN_Users_Manual_2008-6.pdf | ||
| section 2, pages 9–12. | section 2, pages 9–12. | ||
| Line 855: | Line 861: | ||
| like this: | like this: | ||
| 	|issn=0819 4327 gives: [ | 	|issn=0819 4327 gives: [https://www.worldcat.org/issn/0819 4327 0819 4327]	-- can't have spaces in an external link | ||
| This code now prevents that by inserting a hyphen at the ISSN midpoint.  It also validates the ISSN for length | This code now prevents that by inserting a hyphen at the ISSN midpoint.  It also validates the ISSN for length | ||
| Line 959: | Line 965: | ||
| Format LCCN link and do simple error checking.  LCCN is a character string 8-12 characters long. The length of | Format LCCN link and do simple error checking.  LCCN is a character string 8-12 characters long. The length of | ||
| the LCCN dictates the character type of the first 1-3 characters; the rightmost eight are always digits. | the LCCN dictates the character type of the first 1-3 characters; the rightmost eight are always digits. | ||
| https://oclc-research.github.io/infoURI-Frozen/info-uri.info/info:lccn/reg.html | |||
| length = 8 then all digits | length = 8 then all digits | ||
| Line 1,560: | Line 1,566: | ||
| 		options_t.handler = cfg.id_handlers[hkey]; | 		options_t.handler = cfg.id_handlers[hkey]; | ||
| 		options_t.coins_list_t = ID_list_coins_t;								-- pointer to ID_list_coins_t; for |asin= and |ol=; also to keep erroneous values out of the citation's metadata | 		options_t.coins_list_t = ID_list_coins_t;								-- pointer to ID_list_coins_t; for |asin= and |ol=; also to keep erroneous values out of the citation's metadata | ||
| 		options_t.coins_list_t[hkey] = v;										-- id value without accept-as-written markup for metadata | |||
| 		if options_t.handler.access and not in_array (options_t.handler.access, cfg.keywords_lists['id-access']) then | 		if options_t.handler.access and not in_array (options_t.handler.access, cfg.keywords_lists['id-access']) then | ||
| Line 1,618: | Line 1,625: | ||
| ]] | ]] | ||
| local function identifier_lists_get ( | local function identifier_lists_get (args_t, options_t, ID_support_t) | ||
| 	local ID_list_coins_t = extract_ids ( | 	local ID_list_coins_t = extract_ids (args_t);										-- get a table of identifiers and their values for use locally and for use in COinS | ||
| 	options_check (ID_list_coins_t, ID_support_t);										-- ID support parameters must have matching identifier parameters   | 	options_check (ID_list_coins_t, ID_support_t);										-- ID support parameters must have matching identifier parameters   | ||
| 	local ID_access_levels_t = extract_id_access_levels ( | 	local ID_access_levels_t = extract_id_access_levels (args_t, ID_list_coins_t);		-- get a table of identifier access levels | ||
| 	local ID_list_t = build_id_list (ID_list_coins_t, options_t, ID_access_levels_t);	-- get a sequence table of rendered identifier strings | 	local ID_list_t = build_id_list (ID_list_coins_t, options_t, ID_access_levels_t);	-- get a sequence table of rendered identifier strings | ||
| Line 1,655: | Line 1,662: | ||
| 	auto_link_urls = auto_link_urls,											-- table of identifier URLs to be used when auto-linking |title= | 	auto_link_urls = auto_link_urls,											-- table of identifier URLs to be used when auto-linking |title= | ||
| 	identifier_lists_get = identifier_lists_get,								-- experiment to replace individual calls to build_id_list, extract_ids, extract_id_access_levels | 	identifier_lists_get = identifier_lists_get,								-- experiment to replace individual calls to build_id_list(), extract_ids(), extract_id_access_levels() | ||
| 	is_embargoed = is_embargoed; | 	is_embargoed = is_embargoed; | ||
| 	set_selected_modules = set_selected_modules; | 	set_selected_modules = set_selected_modules; | ||
| 	} | 	} | ||