Module:Citation/CS1/Identifiers/sandbox: Difference between revisions

no edit summary
No edit summary
 
imported>Trappist the monk
No edit summary
 
(3 intermediate revisions by 2 users not shown)
Line 1: Line 1:
--[[
--[[
History of changes since last sync: 2021-04-10
History of changes since last sync: 2023-01-14


2021-05-21: add support for |ssrn-access=; see Help_talk:Citation_Style_1#ssrn
2023-02-20: maint cat for temp bibcodes; see Help_talk:Citation_Style_1#New_maintenance_category%3A_Category%3ACS1_maint%3A_bibcode
2021-07-28: reworked error messaging; see Help_talk:Citation_Style_1#error_messaging
2021-08-17: fix false positive doi error detection; see Help_talk:Citation_Style_1#bad_DOI_check


]]
]]
Line 189: Line 187:
This function does not work if it is fed month names for languages other than English.  Wikimedia #time: parser
This function does not work if it is fed month names for languages other than English.  Wikimedia #time: parser
apparently doesn't understand non-English date month names. This function will always return false when the date
apparently doesn't understand non-English date month names. This function will always return false when the date
contains a non-English month name because good1 is false after the call to lang.formatDate().  To get around that
contains a non-English month name because good1 is false after the call to lang_object.formatDate().  To get
call this function with YYYY-MM-DD format dates.
around that, call this function with date parts (y, m, d); it joins them into a YYYY-MM-DD format date.


]=]
]=]


local function is_valid_biorxiv_date (biorxiv_date)
local function is_valid_biorxiv_date (y, m, d)
local biorxiv_date = table.concat ({y, m, d}, '-'); -- make ymd date
local good1, good2;
local good1, good2;
local biorxiv_ts, tomorrow_ts; -- to hold Unix timestamps representing the dates
local biorxiv_ts, tomorrow_ts; -- to hold Unix timestamps representing the dates
Line 259: Line 258:
--[[--------------------------< N O R M A L I Z E _ L C C N >--------------------------------------------------
--[[--------------------------< N O R M A L I Z E _ L C C N >--------------------------------------------------


LCCN normalization (http://www.loc.gov/marc/lccn-namespace.html#normalization)
LCCN normalization (https://www.loc.gov/marc/lccn-namespace.html#normalization)
1. Remove all blanks.
1. Remove all blanks.
2. If there is a forward slash (/) in the string, remove it, and remove all characters to the right of the forward slash.
2. If there is a forward slash (/) in the string, remove it, and remove all characters to the right of the forward slash.
Line 269: Line 268:


Returns a normalized LCCN for lccn() to validate.  There is no error checking (step 3.b.1) performed in this function.
Returns a normalized LCCN for lccn() to validate.  There is no error checking (step 3.b.1) performed in this function.
]]
]]


Line 295: Line 295:
--[[--------------------------< A R X I V >--------------------------------------------------------------------
--[[--------------------------< A R X I V >--------------------------------------------------------------------


See: http://arxiv.org/help/arxiv_identifier
See: https://arxiv.org/help/arxiv_identifier


format and error check arXiv identifier.  There are three valid forms of the identifier:
format and error check arXiv identifier.  There are three valid forms of the identifier:
Line 320: Line 320:
<date code> and <version> are as defined for 0704-1412
<date code> and <version> are as defined for 0704-1412
<number> is a five-digit number
<number> is a five-digit number
]]
]]


Line 388: Line 389:
Validates (sort of) and formats a bibcode ID.
Validates (sort of) and formats a bibcode ID.


Format for bibcodes is specified here: http://adsabs.harvard.edu/abs_doc/help_pages/data.html#bibcodes
Format for bibcodes is specified here: https://adsabs.harvard.edu/abs_doc/help_pages/data.html#bibcodes


But this: 2015arXiv151206696F is apparently valid, so it seems that the only things that really matter are length, 19 characters
But this: 2015arXiv151206696F is apparently valid, so it seems that the only things that really matter are length, 19 characters
Line 428: Line 429:
if id:find('&%.') then
if id:find('&%.') then
err_type = cfg.err_msg_supl.journal; -- journal abbreviation must not have '&.' (if it does it's missing a letter)
err_type = cfg.err_msg_supl.journal; -- journal abbreviation must not have '&.' (if it does it's missing a letter)
end
if id:match ('.........%.tmp%.') then -- temporary bibcodes when positions 10–14 are '.tmp.'
set_message ('maint_bibcode');
end
end
end
end
Line 473: Line 477:


if m then -- m is nil when id is the six-digit form
if m then -- m is nil when id is the six-digit form
if not is_valid_biorxiv_date (y .. '-' .. m .. '-' .. d) then -- validate the encoded date; TODO: don't ignore leap-year and actual month lengths ({{#time:}} is a poor date validator)
if not is_valid_biorxiv_date (y, m, d) then -- validate the encoded date; TODO: don't ignore leap-year and actual month lengths ({{#time:}} is a poor date validator)
break; -- date fail; break out early so we don't unset the error message
break; -- date fail; break out early so we don't unset the error message
end
end
Line 480: Line 484:
break; -- and done
break; -- and done
end
end
end -- err_msg remains set here when no match
end -- err_msg remains set here when no match


if err_msg then
if err_msg then
Line 498: Line 502:


The description of the structure of this identifier can be found at Help_talk:Citation_Style_1/Archive_26#CiteSeerX_id_structure
The description of the structure of this identifier can be found at Help_talk:Citation_Style_1/Archive_26#CiteSeerX_id_structure
]]
]]


Line 533: Line 538:
and terminal punctuation may not be technically correct, but it appears that in practice these characters are rarely
and terminal punctuation may not be technically correct, but it appears that in practice these characters are rarely
if ever used in DOI names.
if ever used in DOI names.
https://www.doi.org/doi_handbook/2_Numbering.html -- 2.2 Syntax of a DOI name
https://www.doi.org/doi_handbook/2_Numbering.html#2.2.2 -- 2.2.2 DOI prefix


]]
]]
Line 579: Line 587:
'^[^1-9]%d%d%d$', -- 4 digits without subcode (0xxx); accepts: 1000–9999
'^[^1-9]%d%d%d$', -- 4 digits without subcode (0xxx); accepts: 1000–9999
'^%d%d%d%d%d%d+', -- 6 or more digits
'^%d%d%d%d%d%d+', -- 6 or more digits
'^%d%d?%d?$', -- less than 4 digits without subcode (with subcode is legitimate)
'^%d%d?%d?$', -- less than 4 digits without subcode (3 digits with subcode is legitimate)
'^%d%d?%.[%d%.]+', -- 1 or 2 digits with subcode
'^5555$', -- test registrant will never resolve
'^5555$', -- test registrant will never resolve
'[^%d%.]', -- any character that isn't a digit or a dot
'[^%d%.]', -- any character that isn't a digit or a dot
Line 627: Line 636:
if ever used in HDLs.
if ever used in HDLs.


Query string parameters are named here: http://www.handle.net/proxy_servlet.html.  Query strings are not displayed
Query string parameters are named here: https://www.handle.net/proxy_servlet.html.  Query strings are not displayed
but since '?' is an allowed character in an HDL, '?' followed by one of the query parameters is the only way we
but since '?' is an allowed character in an HDL, '?' followed by one of the query parameters is the only way we
have to detect the query string so that it isn't URL-encoded with the rest of the identifier.
have to detect the query string so that it isn't URL-encoded with the rest of the identifier.
Line 637: Line 646:
local access = options.access;
local access = options.access;
local handler = options.handler;
local handler = options.handler;
local query_params = { -- list of known query parameters from http://www.handle.net/proxy_servlet.html
local query_params = { -- list of known query parameters from https://www.handle.net/proxy_servlet.html
'noredirect',
'noredirect',
'ignore_aliases',
'ignore_aliases',
Line 806: Line 815:


Determines whether an ISMN string is valid.  Similar to ISBN-13, ISMN is 13 digits beginning 979-0-... and uses the
Determines whether an ISMN string is valid.  Similar to ISBN-13, ISMN is 13 digits beginning 979-0-... and uses the
same check digit calculations.  See http://www.ismn-international.org/download/Web_ISMN_Users_Manual_2008-6.pdf
same check digit calculations.  See https://www.ismn-international.org/download/Web_ISMN_Users_Manual_2008-6.pdf
section 2, pages 9–12.
section 2, pages 9–12.


Line 855: Line 864:
like this:
like this:


|issn=0819 4327 gives: [http://www.worldcat.org/issn/0819 4327 0819 4327] -- can't have spaces in an external link
|issn=0819 4327 gives: [https://www.worldcat.org/issn/0819 4327 0819 4327] -- can't have spaces in an external link
This code now prevents that by inserting a hyphen at the ISSN midpoint.  It also validates the ISSN for length
This code now prevents that by inserting a hyphen at the ISSN midpoint.  It also validates the ISSN for length
Line 959: Line 968:
Format LCCN link and do simple error checking.  LCCN is a character string 8-12 characters long. The length of
Format LCCN link and do simple error checking.  LCCN is a character string 8-12 characters long. The length of
the LCCN dictates the character type of the first 1-3 characters; the rightmost eight are always digits.
the LCCN dictates the character type of the first 1-3 characters; the rightmost eight are always digits.
http://info-uri.info/registry/OAIHandler?verb=GetRecord&metadataPrefix=reg&identifier=info:lccn/
https://oclc-research.github.io/infoURI-Frozen/info-uri.info/info:lccn/reg.html


length = 8 then all digits
length = 8 then all digits
Line 1,560: Line 1,569:
options_t.handler = cfg.id_handlers[hkey];
options_t.handler = cfg.id_handlers[hkey];
options_t.coins_list_t = ID_list_coins_t; -- pointer to ID_list_coins_t; for |asin= and |ol=; also to keep erroneous values out of the citation's metadata
options_t.coins_list_t = ID_list_coins_t; -- pointer to ID_list_coins_t; for |asin= and |ol=; also to keep erroneous values out of the citation's metadata
options_t.coins_list_t[hkey] = v; -- id value without accept-as-written markup for metadata
if options_t.handler.access and not in_array (options_t.handler.access, cfg.keywords_lists['id-access']) then
if options_t.handler.access and not in_array (options_t.handler.access, cfg.keywords_lists['id-access']) then
Line 1,618: Line 1,628:
]]
]]


local function identifier_lists_get (args, options_t, ID_support_t)
local function identifier_lists_get (args_t, options_t, ID_support_t)
local ID_list_coins_t = extract_ids (args); -- get a table of identifiers and their values for use locally and for use in COinS
local ID_list_coins_t = extract_ids (args_t); -- get a table of identifiers and their values for use locally and for use in COinS
options_check (ID_list_coins_t, ID_support_t); -- ID support parameters must have matching identifier parameters  
options_check (ID_list_coins_t, ID_support_t); -- ID support parameters must have matching identifier parameters  
local ID_access_levels_t = extract_id_access_levels (args, ID_list_coins_t); -- get a table of identifier access levels
local ID_access_levels_t = extract_id_access_levels (args_t, ID_list_coins_t); -- get a table of identifier access levels
local ID_list_t = build_id_list (ID_list_coins_t, options_t, ID_access_levels_t); -- get a sequence table of rendered identifier strings
local ID_list_t = build_id_list (ID_list_coins_t, options_t, ID_access_levels_t); -- get a sequence table of rendered identifier strings


Line 1,655: Line 1,665:
auto_link_urls = auto_link_urls, -- table of identifier URLs to be used when auto-linking |title=
auto_link_urls = auto_link_urls, -- table of identifier URLs to be used when auto-linking |title=
identifier_lists_get = identifier_lists_get, -- experiment to replace individual calls to build_id_list, extract_ids, extract_id_access_levels
identifier_lists_get = identifier_lists_get, -- experiment to replace individual calls to build_id_list(), extract_ids, extract_id_access_levels
is_embargoed = is_embargoed;
is_embargoed = is_embargoed;
set_selected_modules = set_selected_modules;
set_selected_modules = set_selected_modules;
}
}