data test;
text = "Phone: 555-123-4567";
/* Find phone number pattern */
pos = prxmatch('/\d{3}-\d{3}-\d{4}/', text);
put pos=; /* Output: pos=8 */
run;data clean_data;
input name $50.;
/* Remove extra spaces */
clean_name = prxchange('s/\s+/ /g', -1, strip(name));
datalines;
John Smith
Mary Jane Doe
;
/* Output:
John Smith
Mary Jane Doe */
run;data _null_;
retain pattern_id;
if _n_ = 1 then pattern_id = prxparse('/\b\w+@\w+\.\w+\b/');
text = "Contact: john.doe@company.com for info";
pos = prxmatch(pattern_id, text);
if pos > 0 then do;
email = prxposn(pattern_id, 0, text);
put email=; /* Output: email=john.doe@company.com */
end;
run;data products;
input description $100.;
/* Extract product code pattern: ABC-1234 */
pattern_id = prxparse('/[A-Z]{3}-\d{4}/');
pos = prxmatch(pattern_id, description);
if pos > 0 then
product_code = prxposn(pattern_id, 0, description);
else
product_code = 'MISSING';
datalines;
Item ABC-1234 Blue Widget $29.99
Special offer DEF-5678 Red Gadget
No code available for this item
Widget GHI-9012 Green Version 2.0
;
/* Output:
ABC-1234
DEF-5678
MISSING
GHI-9012 */
run;data phone_cleanup;
input raw_phone $20.;
/* Remove non-digits */
digits_only = prxchange('s/\D//g', -1, raw_phone);
/* Format as (XXX) XXX-XXXX if 10 digits */
if length(digits_only) = 10 then do;
area_code = substr(digits_only, 1, 3);
exchange = substr(digits_only, 4, 3);
number = substr(digits_only, 7, 4);
formatted_phone = cats('(', area_code, ') ', exchange, '-', number);
end;
else formatted_phone = 'INVALID';
datalines;
555-123-4567
(555) 987-6543
555.111.2222
5551234567
555-12-345
;
/* Output:
(555) 123-4567
(555) 987-6543
(555) 111-2222
(555) 123-4567
INVALID */
run;data email_extract;
input text $200.;
email_pattern = prxparse('/\b[\w.-]+@[\w.-]+\.\w+\b/');
/* Find all emails in text */
start = 1;
email_count = 0;
do while(prxmatch(email_pattern, substr(text, start)) > 0);
pos = prxmatch(email_pattern, substr(text, start));
email_count + 1;
found_email = prxposn(email_pattern, 0, substr(text, start));
/* Create variables email1, email2, etc. */
call symputx(cats('email', email_count), found_email);
start = start + pos + length(found_email) - 1;
end;
datalines;
Contact john@company.com or mary.smith@corp.net for details
Send reports to admin@system.org and backup@system.org
No email addresses in this text
;
run;data transaction_parse;
input transaction $200.;
/* Pattern for: DATE AMOUNT DESCRIPTION */
pattern = prxparse('/(\d{2}\/\d{2}\/\d{4})\s+\$?(\d+\.\d{2})\s+(.+)/');
if prxmatch(pattern, transaction) then do;
trans_date = prxposn(pattern, 1, transaction);
amount = input(prxposn(pattern, 2, transaction), 8.2);
description = strip(prxposn(pattern, 3, transaction));
parsed = 'YES';
end;
else parsed = 'NO';
datalines;
12/15/2023 $125.50 Grocery Store Purchase
01/03/2024 45.00 Gas Station
Invalid transaction format
11/30/2023 $1,250.75 Monthly Rent Payment
;
/* Output:
trans_date=12/15/2023, amount=125.5, description=Grocery Store Purchase
trans_date=01/03/2024, amount=45, description=Gas Station
parsed=NO
(Note: Won't parse $1,250.75 due to comma) */
run;data name_parse;
input full_name $50.;
/* Pattern for: First [Middle] Last [Suffix] */
name_pattern = prxparse('/^(\w+)\s+(?:(\w+)\s+)?(\w+)(?:\s+(Jr\.?|Sr\.?|III?|IV))?$/i');
if prxmatch(name_pattern, strip(full_name)) then do;
first_name = prxposn(name_pattern, 1, full_name);
middle_name = prxposn(name_pattern, 2, full_name);
last_name = prxposn(name_pattern, 3, full_name);
suffix = prxposn(name_pattern, 4, full_name);
if missing(middle_name) then middle_name = '';
if missing(suffix) then suffix = '';
end;
datalines;
John Smith
Mary Jane Watson
Robert Smith Jr.
Elizabeth Anne Johnson III
;
run;data cc_validation;
input cc_number $20.;
/* Remove spaces and dashes */
clean_cc = prxchange('s/[\s-]//g', -1, cc_number);
/* Validate patterns */
visa_pattern = prxparse('/^4\d{15}$/');
mc_pattern = prxparse('/^5[1-5]\d{14}$/');
amex_pattern = prxparse('/^3[47]\d{13}$/');
if prxmatch(visa_pattern, clean_cc) then cc_type = 'VISA';
else if prxmatch(mc_pattern, clean_cc) then cc_type = 'MASTERCARD';
else if prxmatch(amex_pattern, clean_cc) then cc_type = 'AMEX';
else cc_type = 'INVALID';
datalines;
4111-1111-1111-1111
5555 5555 5555 4444
3782 822463 10005
1234567890123456
;
/* Output:
VISA
MASTERCARD
AMEX
INVALID */
run;data ssn_validation;
input ssn $15.;
/* Valid SSN pattern: XXX-XX-XXXX */
ssn_pattern = prxparse('/^\d{3}-\d{2}-\d{4}$/');
if prxmatch(ssn_pattern, ssn) then do;
valid_format = 'YES';
/* Extract parts */
area = substr(ssn, 1, 3);
group = substr(ssn, 5, 2);
serial = substr(ssn, 8, 4);
/* Basic validation rules */
if area in ('000', '666') or area >= '900' then valid_ssn = 'NO';
else if group = '00' then valid_ssn = 'NO';
else if serial = '0000' then valid_ssn = 'NO';
else valid_ssn = 'YES';
end;
else do;
valid_format = 'NO';
valid_ssn = 'NO';
end;
datalines;
123-45-6789
000-12-3456
123-00-4567
123-45-0000
12345-6789
;
run;data log_analysis;
input log_line $300.;
/* Common Log Format pattern */
log_pattern = prxparse('/^(\S+)\s+\S+\s+\S+\s+\[([^\]]+)\]\s+"(\w+)\s+(\S+)\s+\S+"\s+(\d+)\s+(\d+|-)/');
if prxmatch(log_pattern, log_line) then do;
ip_address = prxposn(log_pattern, 1, log_line);
timestamp = prxposn(log_pattern, 2, log_line);
method = prxposn(log_pattern, 3, log_line);
url = prxposn(log_pattern, 4, log_line);
status_code = input(prxposn(log_pattern, 5, log_line), 8.);
bytes_sent = prxposn(log_pattern, 6, log_line);
/* Categorize status codes */
if status_code < 300 then status_category = 'SUCCESS';
else if status_code < 400 then status_category = 'REDIRECT';
else if status_code < 500 then status_category = 'CLIENT_ERROR';
else status_category = 'SERVER_ERROR';
end;
datalines;
192.168.1.1 - - [25/Dec/2023:10:15:32 +0000] "GET /index.html HTTP/1.1" 200 1234
10.0.0.5 - - [25/Dec/2023:10:16:45 +0000] "POST /api/data HTTP/1.1" 404 512
;
run;data large_dataset;
/* Simulate large dataset */
do i = 1 to 1000000;
text = cats('Record ', i, ' with email user', i, '@domain.com');
output;
end;
run;
data extract_emails;
set large_dataset;
/* Compile pattern once, retain across observations */
retain email_pattern;
if _n_ = 1 then email_pattern = prxparse('/\b[\w.-]+@[\w.-]+\.\w+\b/');
if prxmatch(email_pattern, text) then
email = prxposn(email_pattern, 0, text);
drop email_pattern;
run;proc sql;
create table phone_numbers as
select name, phone,
case
when prxmatch('/^\(\d{3}\)\s?\d{3}-\d{4}$/', strip(phone)) then 'VALID'
when prxmatch('/^\d{3}-\d{3}-\d{4}$/', strip(phone)) then 'VALID'
when prxmatch('/^\d{10}$/', compress(phone)) then 'NEEDS_FORMAT'
else 'INVALID'
end as phone_status
from contact_list
where prxmatch('/\d/', phone) > 0; /* Only records with digits */
quit;data financial_extract;
input report_text $500.;
/* Pattern for currency amounts */
currency_pattern = prxparse('/\$[\d,]+\.?\d*/');
/* Pattern for percentages */
percent_pattern = prxparse('/\d+\.?\d*%/');
/* Extract all currency amounts */
start = 1;
amount_count = 0;
total_amount = 0;
do while(prxmatch(currency_pattern, substr(report_text, start)) > 0);
pos = prxmatch(currency_pattern, substr(report_text, start));
amount_count + 1;
found_amount = prxposn(currency_pattern, 0, substr(report_text, start));
/* Convert to numeric */
numeric_amount = input(compress(found_amount, '$,'), 12.2);
total_amount + numeric_amount;
start = start + pos + length(found_amount) - 1;
end;
/* Extract percentage */
if prxmatch(percent_pattern, report_text) then
percentage = prxposn(percent_pattern, 0, report_text);
datalines;
Revenue increased by $125,000 with expenses of $45,500 showing 15.5% growth
Total sales: $2,500,000 profit margin improved to 8.2%
;
run;data masked_data;
input sensitive_info $200.;
/* Mask SSN: XXX-XX-1234 -> XXX-XX-**** */
masked_ssn = prxchange('s/(\d{3}-\d{2}-)\d{4}/$1****/g', -1, sensitive_info);
/* Mask credit cards: show only last 4 digits */
masked_cc = prxchange('s/\b(\d{4}[\s-]?)(\d{4}[\s-]?)(\d{4}[\s-]?)(\d{4})\b/****-****-****-$4/g', -1, masked_ssn);
/* Mask email domains */
final_masked = prxchange('s/(@)[\w.-]+(\.\w+)/$1****$2/g', -1, masked_cc);
datalines;
Customer SSN: 123-45-6789 CC: 4111-1111-1111-1111 Email: john@company.com
Account 987-65-4321 Card 5555-5555-5555-4444 Contact mary@corp.net
;
run;%macro test_pattern(pattern, test_string);
data _null_;
pattern_id = prxparse("&pattern");
if pattern_id > 0 then do;
result = prxmatch(pattern_id, "&test_string");
if result > 0 then
put "MATCH: &test_string at position " result;
else
put "NO MATCH: &test_string";
end;
else put "ERROR: Invalid pattern &pattern";
run;
%mend;
/* Test different patterns */
%test_pattern('/\d{3}-\d{2}-\d{4}/', 123-45-6789);
%test_pattern('/\b\w+@\w+\.\w+\b/', john@company.com);
%test_pattern('/[invalid/', test); /* Invalid pattern */data regex_pitfalls;
input test_data $50.;
/* WRONG: Escaping issues */
/* wrong_pattern = '/\d+\.\d+/'; /* Single backslash */
/* CORRECT: Double backslash in SAS */
correct_pattern = prxparse('/\\d+\\.\\d+/');
/* WRONG: Not anchoring when needed */
loose_match = prxmatch('/\\d{3}/', test_data); /* Matches 123 in 1234567 */
/* CORRECT: Anchored match */
exact_match = prxmatch('/^\\d{3}$/', test_data); /* Matches exactly 3 digits */
datalines;
123
1234
abc123def
;
run;data ae_clean;
input usubjid $20. aeterm $200.;
/* Standardize AE terms - remove extra spaces, fix case */
clean_term = propcase(prxchange('s/\s+/ /g', -1, strip(aeterm)));
/* Extract severity from AE term */
severity_pattern = prxparse('/\b(mild|moderate|severe|serious)\b/i');
if prxmatch(severity_pattern, aeterm) then do;
aesev = upcase(prxposn(severity_pattern, 0, aeterm));
/* Remove severity from term */
clean_aeterm = prxchange('s/\b(mild|moderate|severe|serious)\b//i', -1, clean_term);
clean_aeterm = prxchange('s/\s+/ /g', -1, strip(clean_aeterm));
end;
else do;
aesev = '';
clean_aeterm = clean_term;
end;
/* Extract body system from structured terms */
body_sys_pattern = prxparse('/^([^:]+):\s*(.+)/');
if prxmatch(body_sys_pattern, clean_aeterm) then do;
aebodsys = strip(prxposn(body_sys_pattern, 1, clean_aeterm));
aedecod = strip(prxposn(body_sys_pattern, 2, clean_aeterm));
end;
datalines;
STUDY001-001 HEADACHE SEVERE
STUDY001-002 Gastrointestinal: Nausea mild
STUDY001-003 fatigue moderate
STUDY001-004 Respiratory: shortness of breath
STUDY001-005 RASH SERIOUS
;
/* Output:
STUDY001-001: aesev=SEVERE, clean_aeterm=Headache
STUDY001-002: aebodsys=Gastrointestinal, aedecod=Nausea
STUDY001-003: aesev=MODERATE, clean_aeterm=Fatigue */
run;data lb_parse;
input usubjid $20. lbtest $30. lborres $20. lbornrlo $20. lbornrhi $20.;
/* Parse reference ranges like "5.0-10.5", "<=5.0", ">=2.1" */
range_pattern = prxparse('/^([<>=]*)\s*(\d+\.?\d*)\s*-?\s*(\d+\.?\d*)?/');
/* Parse LBORRES for numeric extraction */
result_pattern = prxparse('/([<>=]*)\s*(\d+\.?\d*)/');
/* Extract numeric result */
if prxmatch(result_pattern, lborres) then do;
lbstresc = prxposn(result_pattern, 2, lborres);
lbstresn = input(lbstresc, 8.2);
/* Extract comparison operator */
operator = prxposn(result_pattern, 1, lborres);
if operator ne '' then lbstresc = cats(operator, lbstresn);
end;
/* Parse reference range low */
if prxmatch(range_pattern, lbornrlo) then
lbstnrlo = input(prxposn(range_pattern, 2, lbornrlo), 8.2);
/* Parse reference range high */
if prxmatch(range_pattern, lbornrhi) then
lbstnrhi = input(prxposn(range_pattern, 2, lbornrhi), 8.2);
/* Determine abnormal flags */
if lbstresn ne . then do;
if lbstnrlo ne . and lbstresn < lbstnrlo then lbnrind = 'LOW';
else if lbstnrhi ne . and lbstresn > lbstnrhi then lbnrind = 'HIGH';
else if lbstnrlo ne . and lbstnrhi ne . then lbnrind = 'NORMAL';
else lbnrind = '';
end;
datalines;
STUDY001-001 Hemoglobin 12.5 10.0 16.0
STUDY001-002 Glucose <5.0 3.5 7.0
STUDY001-003 Creatinine >=2.5 0.5 1.2
STUDY001-004 ALT 45 <=40 <=40
;
/* Output:
STUDY001-001: lbstresn=12.5, lbnrind=NORMAL
STUDY001-002: lbstresn=5.0, lbnrind=HIGH, lbstresc=<5.0
STUDY001-003: lbstresn=2.5, lbnrind=HIGH, lbstresc=>=2.5 */
run;data cm_standardize;
input usubjid $20. cmtrt $100.;
/* Extract drug name and strength */
drug_pattern = prxparse('/^([^0-9]+?)\s*(\d+\.?\d*\s*mg|mcg|g|ml|units?)\s*(.*)/i');
if prxmatch(drug_pattern, strip(cmtrt)) then do;
cmdecod = strip(propcase(prxposn(drug_pattern, 1, cmtrt)));
cmdose = prxposn(drug_pattern, 2, cmtrt);
cmroute = strip(upcase(prxposn(drug_pattern, 3, cmtrt)));
/* Clean drug name - remove common suffixes */
cmdecod = prxchange('s/\s+(tablet|capsule|injection|cream|gel)$//i', -1, cmdecod);
/* Extract numeric dose */
dose_num_pattern = prxparse('/(\d+\.?\d*)/');
if prxmatch(dose_num_pattern, cmdose) then
cmdostxt = prxposn(dose_num_pattern, 0, cmdose);
/* Extract dose unit */
unit_pattern = prxparse('/(mg|mcg|g|ml|units?)/i');
if prxmatch(unit_pattern, cmdose) then
cmdosu = upcase(prxposn(unit_pattern, 0, cmdose));
end;
else do;
cmdecod = propcase(strip(cmtrt));
cmdose = '';
cmroute = '';
end;
/* Standardize route of administration */
if index(upcase(cmroute), 'ORAL') or index(upcase(cmroute), 'PO') then cmroute = 'ORAL';
else if index(upcase(cmroute), 'IV') or index(upcase(cmroute), 'INTRAVENOUS') then cmroute = 'INTRAVENOUS';
else if index(upcase(cmroute), 'IM') or index(upcase(cmroute), 'INTRAMUSCULAR') then cmroute = 'INTRAMUSCULAR';
else if index(upcase(cmroute), 'TOPICAL') then cmroute = 'TOPICAL';
datalines;
STUDY001-001 Acetaminophen 500mg tablet oral
STUDY001-002 lisinopril 10 mg PO daily
STUDY001-003 Insulin 10 units subcutaneous
STUDY001-004 IBUPROFEN 200MG CAPSULE
STUDY001-005 Hydrocortisone 1% cream topical
;
/* Output:
STUDY001-001: cmdecod=Acetaminophen, cmdostxt=500, cmdosu=MG, cmroute=ORAL
STUDY001-002: cmdecod=Lisinopril, cmdostxt=10, cmdosu=MG, cmroute=ORAL
STUDY001-003: cmdecod=Insulin, cmdostxt=10, cmdosu=UNITS, cmroute=SUBCUTANEOUS */
run;data dm_validation;
input usubjid $20.;
/* Standard format: STUDY###-###-### */
subjid_pattern = prxparse('/^([A-Z]+\d{3})-(\d{3})-(\d{3})$/');
if prxmatch(subjid_pattern, usubjid) then do;
studyid = prxposn(subjid_pattern, 1, usubjid);
siteid = prxposn(subjid_pattern, 2, usubjid);
subjid_num = prxposn(subjid_pattern, 3, usubjid);
format_valid = 'Y';
/* Check for sequential numbering issues */
if input(subjid_num, 8.) = . then seq_valid = 'N';
else seq_valid = 'Y';
end;
else do;
format_valid = 'N';
seq_valid = 'N';
/* Try to extract study ID from malformed USUBJID */
study_extract = prxchange('s/^([A-Z]+\d{3}).*/$1/', -1, usubjid);
if study_extract ne usubjid then studyid = study_extract;
end;
datalines;
STUDY001-001-001
STUDY001-001-002
INVALID-SUBJID
STUDY002-999-ABC
STUDY001-001-1
;
/* Output:
STUDY001-001-001: format_valid=Y, studyid=STUDY001, siteid=001
INVALID-SUBJID: format_valid=N
STUDY002-999-ABC: format_valid=N, seq_valid=N */
run;data visit_validation;
input usubjid $20. visitnum 8. visit $30. svstdtc $20.;
/* Extract visit day from visit description */
day_pattern = prxparse('/Day\s+(\d+)/i');
week_pattern = prxparse('/Week\s+(\d+)/i');
month_pattern = prxparse('/Month\s+(\d+)/i');
if prxmatch(day_pattern, visit) then do;
visit_day = input(prxposn(day_pattern, 1, visit), 8.);
visit_type = 'DAY';
end;
else if prxmatch(week_pattern, visit) then do;
visit_day = input(prxposn(week_pattern, 1, visit), 8.) * 7;
visit_type = 'WEEK';
end;
else if prxmatch(month_pattern, visit) then do;
visit_day = input(prxposn(month_pattern, 1, visit), 8.) * 30;
visit_type = 'MONTH';
end;
else if upcase(visit) = 'SCREENING' then do;
visit_day = -14;
visit_type = 'SCREENING';
end;
else if upcase(visit) = 'BASELINE' then do;
visit_day = 0;
visit_type = 'BASELINE';
end;
/* Validate visit numbering sequence */
if visitnum ne . and visit_day ne . then do;
if visitnum = 1 and visit_day ne -14 then visit_seq_issue = 'Y';
else if visitnum = 2 and visit_day ne 0 then visit_seq_issue = 'Y';
else visit_seq_issue = 'N';
end;
datalines;
STUDY001-001-001 1 Screening 2023-01-15
STUDY001-001-001 2 Baseline 2023-01-29
STUDY001-001-001 3 Day 7 2023-02-05
STUDY001-001-001 4 Week 4 2023-02-26
STUDY001-001-001 5 Month 3 2023-04-29
;
/* Output:
Visit 1: visit_day=-14, visit_type=SCREENING
Visit 3: visit_day=7, visit_type=DAY
Visit 4: visit_day=28, visit_type=WEEK */
run;data mh_icd10;
input usubjid $20. mhterm $100. mhdecod $10.;
/* ICD-10 pattern: Letter followed by 2 digits, optional decimal and 1-2 more digits */
icd10_pattern = prxparse('/^[A-Z]\d{2}(\.?\d{1,2})?$/');
/* ICD-10-CM pattern (US specific) */
icd10cm_pattern = prxparse('/^[A-Z]\d{2}\.[A-Z0-9]{1,4}$/');
if prxmatch(icd10_pattern, mhdecod) then do;
icd10_valid = 'Y';
/* Extract category */
category = substr(mhdecod, 1, 1);
/* Categorize by ICD-10 chapter */
select(category);
when('A','B') icd10_chapter = 'Infectious diseases';
when('C','D') icd10_chapter = 'Neoplasms/Blood disorders';
when('E') icd10_chapter = 'Endocrine/Metabolic';
when('F') icd10_chapter = 'Mental/Behavioral';
when('G') icd10_chapter = 'Nervous system';
when('H') icd10_chapter = 'Eye/Ear';
when('I') icd10_chapter = 'Circulatory';
when('J') icd10_chapter = 'Respiratory';
when('K') icd10_chapter = 'Digestive';
when('L') icd10_chapter = 'Skin';
when('M') icd10_chapter = 'Musculoskeletal';
when('N') icd10_chapter = 'Genitourinary';
otherwise icd10_chapter = 'Other';
end;
end;
else do;
icd10_valid = 'N';
icd10_chapter = '';
end;
datalines;
STUDY001-001-001 Hypertension I10
STUDY001-001-001 Type 2 Diabetes E11.9
STUDY001-001-001 Depression F32.1
STUDY001-001-001 Asthma J45
STUDY001-001-001 Back Pain INVALID
;
/* Output:
I10: icd10_valid=Y, icd10_chapter=Circulatory
E11.9: icd10_valid=Y, icd10_chapter=Endocrine/Metabolic
INVALID: icd10_valid=N */
run;data sae_analysis;
input usubjid $20. aeterm $50. narrative $500.;
/* Extract timeline information */
timeline_pattern = prxparse('/(\d+)\s+(day|week|month|hour)s?\s+(after|before|since)/i');
if prxmatch(timeline_pattern, narrative) then do;
time_value = input(prxposn(timeline_pattern, 1, narrative), 8.);
time_unit = upcase(prxposn(timeline_pattern, 2, narrative));
time_relation = upcase(prxposn(timeline_pattern, 3, narrative));
/* Convert to days */
select(time_unit);
when('HOUR') timeline_days = time_value / 24;
when('DAY') timeline_days = time_value;
when('WEEK') timeline_days = time_value * 7;
when('MONTH') timeline_days = time_value * 30;
end;
end;
/* Check for hospitalization */
hosp_pattern = prxparse('/hospital|admit|emergency|ER|ICU/i');
hospitalized = ifc(prxmatch(hosp_pattern, narrative) > 0, 'Y', 'N');
/* Extract outcome information */
outcome_pattern = prxparse('/\b(resolved|recovering|ongoing|fatal|death)\b/i');
if prxmatch(outcome_pattern, narrative) then
outcome = upcase(prxposn(outcome_pattern, 0, narrative));
/* Drug relationship assessment */
relation_pattern = prxparse('/\b(related|unrelated|possibly related|probably related)\b/i');
if prxmatch(relation_pattern, narrative) then
causality = propcase(prxposn(relation_pattern, 0, narrative));
datalines;
STUDY001-001-001 Chest Pain Patient developed chest pain 3 days after starting study drug. Admitted to hospital for evaluation. Event resolved after 5 days and was assessed as possibly related to study medication.
STUDY001-002-002 Syncope Subject experienced syncope 2 weeks since last dose. Taken to emergency room. Currently recovering. Unrelated to study drug.
;
/* Output:
001-001: time_value=3, time_unit=DAY, hospitalized=Y, outcome=RESOLVED, causality=Possibly Related
002-002: time_value=2, time_unit=WEEK, hospitalized=Y, outcome=RECOVERING, causality=Unrelated */
run;data lab_integration;
input raw_data $300.;
/* Parse pipe-delimited lab results */
/* Format: SubjectID|TestName|Result|Units|RefLow|RefHigh|Date */
lab_pattern = prxparse('/^([^|]+)\|([^|]+)\|([^|]+)\|([^|]+)\|([^|]*)\|([^|]*)\|([^|]+)$/');
if prxmatch(lab_pattern, raw_data) then do;
ext_subjid = strip(prxposn(lab_pattern, 1, raw_data));
ext_testname = strip(prxposn(lab_pattern, 2, raw_data));
ext_result = strip(prxposn(lab_pattern, 3, raw_data));
ext_units = strip(prxposn(lab_pattern, 4, raw_data));
ext_reflow = strip(prxposn(lab_pattern, 5, raw_data));
ext_refhigh = strip(prxposn(lab_pattern, 6, raw_data));
ext_date = strip(prxposn(lab_pattern, 7, raw_data));
/* Map external subject ID to study format */
if prxmatch('/^\d{4}$/', ext_subjid) then
usubjid = cats('STUDY001-001-', put(input(ext_subjid, 8.), z3.));
/* Standardize test names */
select(upcase(ext_testname));
when('HGB', 'HEMOGLOBIN') lbtest = 'Hemoglobin';
when('HCT', 'HEMATOCRIT') lbtest = 'Hematocrit';
when('WBC', 'WHITE BLOOD CELLS') lbtest = 'White Blood Cell Count';
when('GLUC', 'GLUCOSE') lbtest = 'Glucose';
when('CREAT', 'CREATININE') lbtest = 'Creatinine';
otherwise lbtest = propcase(ext_testname);
end;
parsed_successfully = 'Y';
end;
else parsed_successfully = 'N';
datalines;
1001|HGB|12.5|g/dL|12.0|16.0|2023-03-15
1002|GLUC|95|mg/dL|70|100|2023-03-15
1003|Invalid format missing pipes
1004|WBC|7.2|K/uL|4.0|11.0|2023-03-16
;
/* Output:
1001: usubjid=STUDY001-001-001, lbtest=Hemoglobin, parsed_successfully=Y
1003: parsed_successfully=N */
run;data sdtm_comments;
input domain $2. variable $8. comment $200.;
/* Check for required comment patterns for regulatory submission */
/* Missing value explanation */
missing_pattern = prxparse('/\bNOT\s+(DONE|PERFORMED|COLLECTED|AVAILABLE)\b/i');
/* Imputation flag */
impute_pattern = prxparse('/\b(IMPUTED|ESTIMATED|DERIVED)\b/i');
/* Protocol deviation */
deviation_pattern = prxparse('/\b(DEVIATION|VIOLATION|OUT\s+OF\s+WINDOW)\b/i');
/* Data query resolution */
query_pattern = prxparse('/\b(QUERY|CLARIFICATION|CORRECTION)\b/i');
comment_type = '';
if prxmatch(missing_pattern, comment) then comment_type = 'MISSING_DATA';
else if prxmatch(impute_pattern, comment) then comment_type = 'IMPUTATION';
else if prxmatch(deviation_pattern, comment) then comment_type = 'DEVIATION';
else if prxmatch(query_pattern, comment) then comment_type = 'QUERY_RESOLUTION';
else comment_type = 'OTHER';
/* Check comment length (regulatory requirement) */
comment_length = length(strip(comment));
if comment_length > 200 then comment_length_flag = 'EXCEEDS_LIMIT';
else comment_length_flag = 'OK';
datalines;
VS VSORRES Visit not performed due to subject illness
LB LBORRES Lab sample not collected - subject missed visit
EG EGORRES ECG reading imputed from previous visit
AE AESTDTC Date estimated based on subject diary
CM CMSTDTC Medication start date query resolved with site
;
/* Output:
VS: comment_type=MISSING_DATA
LB: comment_type=MISSING_DATA
EG: comment_type=IMPUTATION
AE: comment_type=IMPUTATION */
run;/* Email: */ /\b[\w.-]+@[\w.-]+\.\w+\b/
/* Phone: */ /\d{3}-\d{3}-\d{4}/
/* SSN: */ /\d{3}-\d{2}-\d{4}/
/* Credit Card: */ /\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}/
/* IP Address: */ /\b(?:\d{1,3}\.){3}\d{1,3}\b/
/* URL: */ /https?:\/\/[\w.-]+\.\w+/
/* Currency: */ /\$[\d,]+\.?\d*/
/* Date MM/DD/YYYY: */ /\d{1,2}\/\d{1,2}\/\d{4}/
/* Alphanumeric: */ /^[A-Za-z0-9]+$/
/* Remove spaces: */ s/\s+/ /g/* Social Security Numbers */
/^\d{3}-\d{2}-\d{4}$/ /* XXX-XX-XXXX */
/^\d{3}\s\d{2}\s\d{4}$/ /* XXX XX XXXX */
/^\d{9}$/ /* XXXXXXXXX */
/* Driver's License (US States) */
/^[A-Z]\d{7}$/ /* CA format: A1234567 */
/^[A-Z]{2}\d{6}$/ /* NY format: AB123456 */
/^\d{8}$/ /* FL format: 12345678 */
/^[A-Z]\d{8}$/ /* TX format: A12345678 */
/* Passport Numbers */
/^[A-Z]\d{8}$/ /* US: A12345678 */
/^[A-Z]{2}\d{7}$/ /* UK: AB1234567 */
/^\d{2}[A-Z]{2}\d{5}$/ /* German: 12AB12345 */
/* Tax ID Numbers */
/^\d{2}-\d{7}$/ /* EIN: XX-XXXXXXX */
/^\d{3}-\d{2}-\d{4}$/ /* ITIN: XXX-XX-XXXX *//* Phone Numbers - International */
/^\+1\s?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$/ /* US: +1 (555) 123-4567 */
/^\+44\s?\d{4}\s?\d{6}$/ /* UK: +44 1234 567890 */
/^\+49\s?\d{3}\s?\d{8}$/ /* Germany: +49 123 12345678 */
/^\+33\s?\d{1}\s?\d{8}$/ /* France: +33 1 12345678 */
/^\+86\s?\d{3}\s?\d{8}$/ /* China: +86 138 12345678 */
/* Phone Numbers - Domestic Formats */
/^\(\d{3}\)\s?\d{3}-\d{4}$/ /* (555) 123-4567 */
/^\d{3}-\d{3}-\d{4}$/ /* 555-123-4567 */
/^\d{3}\.\d{3}\.\d{4}$/ /* 555.123.4567 */
/^\d{10}$/ /* 5551234567 */
/^1-\d{3}-\d{3}-\d{4}$/ /* 1-555-123-4567 */
/* Email Addresses - Basic to Complex */
/^\w+@\w+\.\w+$/ /* Basic: user@domain.com */
/^[\w.-]+@[\w.-]+\.\w{2,}$/ /* Standard: user.name@sub.domain.com */
/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/ /* RFC compliant */
/^[\w._%+-]+@[\w.-]+\.\w{2,}(?:\.\w{2,})?$/ /* Multiple TLDs */
/* Addresses */
/^\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Drive|Dr|Lane|Ln|Boulevard|Blvd)\.?$/i
/^P\.?O\.?\s+Box\s+\d+$/i /* PO Box 1234 */
/^\d{5}(?:-\d{4})?$/ /* ZIP: 12345 or 12345-6789 */
/^[A-Z]\d[A-Z]\s?\d[A-Z]\d$/ /* Canadian Postal: A1B 2C3 *//* Credit Card Numbers */
/^4\d{15}$/ /* Visa: 16 digits starting with 4 */
/^5[1-5]\d{14}$/ /* Mastercard: 16 digits starting 51-55 */
/^3[47]\d{13}$/ /* Amex: 15 digits starting 34/37 */
/^6(?:011|5\d{2})\d{12}$/ /* Discover: 16 digits */
/^35(?:2[89]|[3-8]\d)\d{12}$/ /* JCB */
/* Bank Account Numbers */
/^\d{8,17}$/ /* US Bank Account */
/^\d{2}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{2}$/ /* IBAN */
/^[A-Z]{4}[A-Z]{2}\d{2}[A-Z0-9]{12}$/ /* SWIFT/BIC */
/* Currency Amounts */
/^\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?$/ /* USD: $1,234.56 */
/^€\d{1,3}(?:\.\d{3})*(?:,\d{2})?$/ /* EUR: €1.234,56 */
/^£\d{1,3}(?:,\d{3})*(?:\.\d{2})?$/ /* GBP: £1,234.56 */
/^¥\d{1,3}(?:,\d{3})*$/ /* JPY: ¥1,234 */
/^\d+\.\d{2}$/ /* Decimal: 1234.56 */
/* Investment/Trading */
/^[A-Z]{1,5}$/ /* Stock Symbol: AAPL */
/^[A-Z]{6}\d{2}[A-Z]\d{8}$/ /* CUSIP */
/^[A-Z0-9]{12}$/ /* ISIN *//* Medical Record Numbers */
/^MR\d{6,8}$/ /* MR1234567 */
/^\d{7,10}$/ /* 1234567890 */
/^[A-Z]{2}\d{6}$/ /* AB123456 */
/* Drug Codes and Identifiers */
/^\d{4}-\d{4}-\d{2}$/ /* NDC: 1234-5678-90 */
/^\d{5}-\d{3}-\d{2}$/ /* NDC: 12345-678-90 */
/^\d{5}-\d{4}-\d{1}$/ /* NDC: 12345-6789-0 */
/^[A-Z]\d{2}(?:\.\d{1,2})?$/ /* ICD-10: A12.34 */
/^\d{3}(?:\.\d{2})?$/ /* ICD-9: 123.45 */
/^\d{5}$/ /* CPT: 12345 */
/* Lab Values and Measurements */
/^[<>=≤≥]\s?\d+\.?\d*$/ /* <5.0, >=10.2 */
/^\d+\.?\d*\s?-\s?\d+\.?\d*$/ /* Range: 5.0-10.5 */
/^(?:NEG|POS|NEGATIVE|POSITIVE)$/i /* Qualitative results */
/^(?:NORMAL|ABNORMAL|HIGH|LOW)$/i /* Reference indicators */
/* Dosage and Administration */
/^\d+\.?\d*\s?(mg|mcg|g|ml|units?|IU)$/i /* 500mg, 10.5mcg */
/^(?:QD|BID|TID|QID|Q\d+H)$/i /* Frequency: BID, Q8H */
/^(?:PO|IV|IM|SC|SL|PR|TOP)$/i /* Route: PO, IV *//* ISO 8601 Formats */
/^\d{4}-\d{2}-\d{2}$/ /* YYYY-MM-DD */
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/ /* YYYY-MM-DDTHH:MM:SS */
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/ /* UTC */
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[+-]\d{2}:\d{2}$/ /* Timezone */
/* US Date Formats */
/^\d{1,2}\/\d{1,2}\/\d{4}$/ /* M/D/YYYY or MM/DD/YY */
/^\d{1,2}-\d{1,2}-\d{4}$/ /* M-D-YYYY */
/^\d{2}\/\d{2}\/\d{2}$/ /* MM/DD/YY */
/* European Date Formats */
/^\d{1,2}\/\d{1,2}\/\d{4}$/ /* D/M/YYYY */
/^\d{2}\.\d{2}\.\d{4}$/ /* DD.MM.YYYY (German) */
/* Time Formats */
/^\d{1,2}:\d{2}$/ /* H:MM or HH:MM */
/^\d{1,2}:\d{2}:\d{2}$/ /* H:MM:SS */
/^\d{1,2}:\d{2}\s?(?:AM|PM)$/i /* 3:45 PM */
/^(?:[01]\d|2[0-3]):\d{2}:\d{2}$/ /* 24-hour: 23:59:59 */
/* Relative Time */
/^\d+\s?(seconds?|minutes?|hours?|days?|weeks?|months?|years?)\s?ago$/i
/^in\s?\d+\s?(seconds?|minutes?|hours?|days?|weeks?|months?|years?)$/i
/^(?:today|yesterday|tomorrow)$/i/* IP Addresses */
/^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$/ /* IPv4: 192.168.1.1 */
/^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$/ /* Valid IPv4 */
/^(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$/ /* IPv6 full */
/^([0-9a-fA-F]{1,4}:){1,7}:$/ /* IPv6 compressed */
/* MAC Addresses */
/^[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}$/
/^[0-9A-Fa-f]{2}-[0-9A-Fa-f]{2}-[0-9A-Fa-f]{2}-[0-9A-Fa-f]{2}-[0-9A-Fa-f]{2}-[0-9A-Fa-f]{2}$/
/* URLs and Domains */
/^https?:\/\/[\w.-]+\.\w{2,}(?:\/[\w.-]*)*\/?$/ /* Basic URL */
/^https?:\/\/(?:www\.)?[\w.-]+\.\w{2,}(?:\/[\w./?#-]*)?$/ /* With www optional */
/^ftp:\/\/[\w.-]+\.\w{2,}(?:\/[\w.-]*)*\/?$/ /* FTP */
/^[\w.-]+\.\w{2,}$/ /* Domain only */
/* File Paths */
/^[A-Za-z]:\\(?:[^\\/:*?"<>|]+\\)*[^\\/:*?"<>|]*$/ /* Windows: C:\path\file.txt */
/^\/(?:[^\/]+\/)*[^\/]*$/ /* Unix/Linux: /path/file.txt */
/^~\/(?:[^\/]+\/)*[^\/]*$/ /* Home directory: ~/file.txt */
/* Database Identifiers */
/^[a-zA-Z_][a-zA-Z0-9_]*$/ /* Valid SQL identifier */
/^`[^`]+`$/ /* MySQL quoted identifier */
/^\[[^\]]+\]$/ /* SQL Server bracketed *//* File Extensions */
/\.(?:txt|csv|xlsx?|docx?|pdf|pptx?)$/i /* Office documents */
/\.(?:jpg|jpeg|png|gif|bmp|svg)$/i /* Images */
/\.(?:mp4|avi|mov|wmv|flv|mkv)$/i /* Videos */
/\.(?:mp3|wav|flac|aac|ogg)$/i /* Audio */
/\.(?:zip|rar|7z|tar|gz)$/i /* Archives */
/* Version Numbers */
/^v?\d+\.\d+\.\d+$/ /* Semantic: v1.2.3 */
/^v?\d+\.\d+\.\d+\.\d+$/ /* Four part: 1.2.3.4 */
/^v?\d+\.\d+\.\d+-(?:alpha|beta|rc)\d*$/i /* Pre-release */
/* License Plates (US) */
/^[A-Z0-9]{2,8}$/ /* General format */
/^[A-Z]{3}\s?\d{3}$/ /* ABC 123 */
/^\d{3}\s?[A-Z]{3}$/ /* 123 ABC */
/^[A-Z]{2}\s?\d{4}$/ /* AB 1234 *//* SKU/Product Codes */
/^[A-Z0-9]{6,12}$/ /* Alphanumeric SKU */
/^[A-Z]{2,4}-\d{4,8}$/ /* ABC-12345 */
/^\d{12}$/ /* UPC-A */
/^\d{8}$/ /* UPC-E */
/^\d{13}$/ /* EAN-13 */
/* Invoice/Order Numbers */
/^INV-\d{6,8}$/ /* INV-123456 */
/^PO-\d{4}-\d{4}$/ /* PO-2023-1234 */
/^ORD\d{8}$/ /* ORD12345678 */
/* Company Identifiers */
/^\d{2}-\d{7}$/ /* EIN: 12-3456789 */
/^[A-Z]{1,4}\d{4,6}$/ /* Stock ticker with number */
/^D-U-N-S\s?\d{9}$/ /* DUNS number *//* Chemical Formulas */
/^[A-Z][a-z]?\d*(?:[A-Z][a-z]?\d*)*$/ /* H2O, C6H12O6 */
/^[A-Z][a-z]?(?:\d+)?(?:\([A-Z][a-z]?(?:\d+)?\)\d*)*$/ /* Ca(OH)2 */
/* Gene/Protein Names */
/^[A-Z]{3,5}\d*$/ /* BRCA1, TP53 */
/^[A-Z][a-z]{2,10}$/ /* Insulin, Myosin */
/* PubMed IDs */
/^\d{8}$/ /* 12345678 */
/^PMID:\s?\d{8}$/ /* PMID: 12345678 */
/* DOI */
/^10\.\d{4,}\/[-._;()\/:a-zA-Z0-9]+$/ /* 10.1000/xyz123 */
/^doi:10\.\d{4,}\/[-._;()\/:a-zA-Z0-9]+$/i /* doi:10.1000/xyz123 */
/* Coordinates */
/^-?\d{1,3}\.\d+,\s?-?\d{1,3}\.\d+$/ /* Lat,Long: 40.7128,-74.0060 */
/^-?(?:180(?:\.0+)?|(?:1[0-7]\d|\d{1,2})(?:\.\d+)?)$/ /* Valid longitude */
/^-?(?:90(?:\.0+)?|[1-8]?\d(?:\.\d+)?)$/ /* Valid latitude *//* Log Levels */
/^\[?(DEBUG|INFO|WARN|ERROR|FATAL)\]?/i /* [INFO] message */
/^\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}\.\d{3}/ /* Timestamp with ms */
/* HTTP Status Codes */
/^[1-5]\d{2}$/ /* 200, 404, 500 */
/^2\d{2}$/ /* Success 2xx */
/^4\d{2}$/ /* Client error 4xx */
/^5\d{2}$/ /* Server error 5xx */
/* Server Response Times */
/^\d+(?:\.\d+)?ms$/ /* 123.45ms */
/^\d+(?:\.\d+)?s$/ /* 1.23s */
/* Memory/Storage Sizes */
/^\d+(?:\.\d+)?\s?(?:B|KB|MB|GB|TB|PB)$/i /* 1.5 GB */
/^\d+(?:\.\d+)?\s?(?:bytes?|kilo|mega|giga|tera)$/i/* Social Media Handles */
/^@[A-Za-z0-9_]{1,15}$/ /* Twitter: @username */
/^#[A-Za-z0-9_]+$/ /* Hashtag: #hashtag */
/^\/u\/[A-Za-z0-9_-]{3,20}$/ /* Reddit: /u/username */
/* Modern Web Identifiers */
/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i /* UUID */
/^[A-Za-z0-9_-]{22}$/ /* YouTube video ID */
/^[A-Za-z0-9_-]{11}$/ /* Short YouTube ID */
/* API Keys */
/^[A-Za-z0-9]{32}$/ /* 32 char API key */
/^[A-Za-z0-9]{40}$/ /* 40 char API key */
/^[A-Za-z0-9_-]{43}$/ /* JWT-like format *//* Empty/Whitespace */
/^\s*$/ /* Empty or whitespace only */
/^\s+/ /* Leading whitespace */
/\s+$/ /* Trailing whitespace */
/\s{2,}/ /* Multiple consecutive spaces */
/* Special Characters */
/[^\x00-\x7F]/ /* Non-ASCII characters */
/[^\w\s]/ /* Non-alphanumeric, non-space */
/[\x00-\x1F\x7F]/ /* Control characters */
/* Data Type Validation */
/^-?\d+$/ /* Integers */
/^-?\d*\.?\d+$/ /* Numbers (int or decimal) */
/^[+-]?\d*\.?\d+(?:[eE][+-]?\d+)?$/ /* Scientific notation */
/^(?:true|false)$/i /* Boolean */
/^(?:yes|no|y|n)$/i /* Yes/No variants */
/* Length Validation */
/^.{8,}$/ /* Minimum 8 characters */
/^.{0,255}$/ /* Maximum 255 characters */
/^.{8,128}$/ /* Between 8-128 characters *//* Password Strength */
/^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)[A-Za-z\d@$!%*?&]{8,}$/ /* Strong password */
/^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{12,}$/ /* Very strong */
/^[A-Za-z\d@$!%*?&]{6,20}$/ /* Basic password */
/* Security Tokens */
/^[A-Za-z0-9+\/]{64}=?=?$/ /* Base64 token */
/^[0-9a-f]{64}$/ /* SHA-256 hash */
/^[0-9a-f]{32}$/ /* MD5 hash *//^[A-Z]{2}\d{3,4}$/ /* Flight number: AA1234 */
/^[A-Z]{3}$/ /* Airport code: JFK */
/^N\d{1,5}[A-Z]{0,2}$/ /* Aircraft tail: N12345A *//^[A-Z]\d{2}\.[A-Z0-9]{1,4}$/ /* ICD-10-CM: A12.123 */
/^\d{5}-\d{4}-\d{2}$/ /* HCPCS: 12345-1234-12 */
/^[A-Z]{2}\d{8}$/ /* DEA number: AB1234567 *//^\d{2}-cv-\d{5}$/ /* Court case: 21-cv-12345 */
/^USC\s?\d{2}\s?§\s?\d+$/ /* US Code: USC 42 § 1983 */
/^CFR\s?\d{2}\s?\d+\.\d+$/ /* Code of Federal Regulations *//^\d{4}-\d{4}$/ /* Academic year: 2023-2024 */
/^[A-Z]{2,4}\s?\d{3}$/ /* Course code: MATH 101 */
/^\d{9}$/ /* Student ID: 123456789 */PRXPARSE() - Compile regex patternPRXMATCH() - Find pattern positionPRXCHANGE() - Replace patternsPRXPOSN() - Extract matched groupsPRXPAREN() - Number of capture groupsPRXNEXT() - Find next match